Part1-Q2:EDA¶
Part1-Q2: Setup¶
1. Setup and Load Data¶
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import ast  # For safely evaluating string representations of lists/dicts if needed
import plotly.io as pio

# Render Plotly figures inline in the notebook ('notebook_connected' loads
# plotly.js from a CDN, so the first render needs network access).
pio.renderers.default = 'notebook_connected'
# Set plot style
sns.set_theme(style="whitegrid")

# Define file paths
INPUT_PATH = "Project_Data_Enriched_Combined/meta_data_enriched_all_v3.csv"  # enriched master dataset
FILTERED_OUTPUT_PATH = "EDA_2698.csv"  # filtered subset written by the next cell

# Load the dataset; abort if it cannot be read, since every later cell
# depends on df_full. (NOTE(review): exit() inside a notebook stops the
# kernel rather than just the cell — confirm this is the intended behavior.)
try:
    df_full = pd.read_csv(INPUT_PATH)
    print(f"Successfully loaded data from {INPUT_PATH}")
    print(f"Original shape: {df_full.shape}")
except FileNotFoundError:
    print(f"Error: File not found at {INPUT_PATH}")
    exit()
except Exception as e:
    print(f"Error loading data: {e}")
    exit()

# Display basic info and first few rows
print("\nOriginal Data Info:")
df_full.info()
print("\nOriginal Data Head:")
print(df_full.head())
Successfully loaded data from Project_Data_Enriched_Combined/meta_data_enriched_all_v3.csv
Original shape: (4646, 33)
Original Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4646 entries, 0 to 4645
Data columns (total 33 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ProjID 2698 non-null float64
1 ProjectStatus 2725 non-null object
2 ProjectTitle 2698 non-null object
3 ProjectRDC 2935 non-null object
4 ProjectYearStarted 2698 non-null float64
5 ProjectYearEnded 1572 non-null float64
6 ProjectPI 3963 non-null object
7 OutputTitle 4646 non-null object
8 OutputBiblio 4646 non-null object
9 OutputType 4646 non-null object
10 OutputStatus 4646 non-null object
11 OutputVenue 3873 non-null object
12 OutputYear 4646 non-null int64
13 OutputMonth 4644 non-null float64
14 OutputVolume 2993 non-null object
15 OutputNumber 2736 non-null object
16 OutputPages 4644 non-null object
17 SourceFile 4646 non-null object
18 DOI 4646 non-null object
19 Keywords 4639 non-null object
20 Authors 4640 non-null object
21 RawAuthorNames 4640 non-null object
22 Affiliations 4358 non-null object
23 RawAffiliations 4358 non-null object
24 IsFSRDCRelated 4646 non-null bool
25 MatchingFSRDCKeywords 4646 non-null object
26 _match_score 4646 non-null int64
27 _metadata_source 4646 non-null object
28 _error 0 non-null float64
29 IsEnrichedByMetadata1 4646 non-null bool
30 IsEnrichedByMetadata2 4646 non-null bool
31 IsEnriched 4646 non-null bool
32 IsEnrichedByMetadata3 4646 non-null bool
dtypes: bool(5), float64(5), int64(2), object(21)
memory usage: 1.0+ MB
Original Data Head:
ProjID ProjectStatus ProjectTitle \
0 1774.0 Completed Measuring the End-Use of U.S. Traded Goods
1 NaN NaN NaN
2 NaN NaN NaN
3 NaN NaN NaN
4 595.0 Completed Household Mobility and Environmental Health
ProjectRDC ProjectYearStarted ProjectYearEnded ProjectPI \
0 Kansas City 2017.0 2022.0 Nicholas Sly
1 NaN NaN NaN Lucas W Davis
2 NaN NaN NaN Lucas W Davis
3 NaN NaN NaN Lucas W Davis
4 Michigan 2007.0 2012.0 Lucas W Davis
OutputTitle \
0 From Selling Goods to Selling Services: Firm R...
1 Contribution of air conditioning adoption to f...
2 Cash for Coolers: Evaluating a Large-Scale App...
3 The Distributional Effects of US Clean Energy ...
4 Estimating the effect of a gasoline tax on car...
OutputBiblio OutputType ... \
0 Breinlich, H., Soderbery, A., & Wright, G. (20... MI ...
1 Davis, L.W. & Gertler, P. (2015). Contribution... MI ...
2 Davis, L.W., Fuchs, A., & Gertler, P. (2014). ... MI ...
3 Borenstein, S. & Davis, L.W. (2016). The Distr... MI ...
4 Davis, L.W. & Kilian, L. (2010). Estimating th... MI ...
RawAffiliations IsFSRDCRelated \
0 Purdue University West Lafayette; Purdue Unive... True
1 Haas School of Business, University of Califor... True
2 Haas School of Business, University of Califor... True
3 University of California at Berkeley and NBER;... True
4 Centre for Economic Policy Research; Centre fo... True
MatchingFSRDCKeywords _match_score \
0 cepr 100
1 microdata, national bureau of economic research 100
2 national bureau of economic research 100
3 nber 100
4 national bureau of economic research 100
_metadata_source _error IsEnrichedByMetadata1 IsEnrichedByMetadata2 \
0 openalex NaN False False
1 openalex NaN False False
2 openalex NaN False False
3 openalex NaN False False
4 openalex NaN True False
IsEnriched IsEnrichedByMetadata3
0 True True
1 False False
2 False False
3 False False
4 True False
[5 rows x 33 columns]
2. Filter Enriched Rows and Save (keep ONLY the 2698 matched records with project-level data)¶
- for further analysis
- and for later Data Science questions
# Filter the DataFrame to keep only rows where IsEnriched is True.
# The column is boolean (see df_full.info() above), so it can be used
# directly as a mask; .copy() avoids SettingWithCopyWarning later.
df_enriched = df_full[df_full["IsEnriched"]].copy()
print(f"\nFiltered shape (enriched rows only): {df_enriched.shape}")

# Verify the count matches the expected 2698
if df_enriched.shape[0] == 2698:
    print("Filtered row count matches the expected 2698.")
else:
    print(
        f"Warning: Filtered row count is {df_enriched.shape[0]}, expected 2698. Please check the 'IsEnriched' column."
    )

# Convert float columns (IDs, years, months) to pandas' nullable Int64 so
# integer-valued data keeps NA support without a float representation.
# Note: the previous string-replacement step ('', 'None' -> NaN) was removed —
# it was a no-op on float-dtype columns (they cannot contain strings) and
# triggers a FutureWarning in recent pandas.
print("\nConverting float columns to integer...")
for col in df_enriched.columns:
    if not pd.api.types.is_float_dtype(df_enriched[col]):
        continue
    try:
        non_na_values = df_enriched[col].dropna()
        if non_na_values.empty:
            # All values are NA: the cast is trivially safe.
            df_enriched[col] = df_enriched[col].astype('Int64')
            print(f"Column '{col}' contains only NA values, converted to integer type")
        elif (non_na_values % 1 == 0).all():
            # Every non-NA value is a whole number: cast directly
            # (vectorized check instead of a per-element .apply).
            df_enriched[col] = df_enriched[col].astype('Int64')
            print(f"Column '{col}' converted to integer type (preserving NA values)")
        else:
            # Fractional values present: round first, then cast.
            df_enriched[col] = df_enriched[col].round().astype('Int64')
            print(f"Column '{col}' contains non-integer values, rounded and converted to integer type")
    except Exception as e:
        print(f"Error converting column '{col}': {e}")
        continue

# Save the filtered data
try:
    df_enriched.to_csv(FILTERED_OUTPUT_PATH, index=False)
    print(f"Successfully saved filtered data to {FILTERED_OUTPUT_PATH}")
except Exception as e:
    print(f"Error saving filtered data: {e}")

# Use df_enriched for all subsequent analysis
df = df_enriched
Filtered shape (enriched rows only): (2698, 33) Filtered row count matches the expected 2698. Converting float columns to integer... Column 'ProjID' converted to integer type (preserving NA values) Column 'ProjectYearStarted' converted to integer type (preserving NA values) Column 'ProjectYearEnded' converted to integer type (preserving NA values) Column 'OutputMonth' converted to integer type (preserving NA values) Column '_error' contains only NA values, converted to integer type Successfully saved filtered data to EDA_2698.csv
3. Part1-Q2.1: Top 10 RDCs by Research Output Count¶
- note: these are records beyond those already listed in ResearchOutputs.xlsx — all of them are newly discovered outputs
import plotly.express as px

print("\n--- EDA: Top 10 RDCs by Output Count (Improved Plotly Version) ---")

if "ProjectRDC" not in df.columns:
    print("Column 'ProjectRDC' not found in the dataset.")
else:
    # Ten RDCs with the most research outputs, largest first.
    top_rdcs = df["ProjectRDC"].value_counts().nlargest(10)
    print("Top 10 RDCs by Number of Research Outputs:")
    print(top_rdcs)

    # Horizontal bar chart: bar length and colour both encode the count.
    fig = px.bar(
        x=top_rdcs.values,
        y=top_rdcs.index,
        orientation="h",
        labels={"x": "Number of Outputs", "y": "RDC"},
        title="Top 10 RDCs by Number of Research Outputs",
        template="plotly_white",
        text=top_rdcs.values,         # annotate each bar with its count
        color=top_rdcs.values,        # colour encodes the same quantity
        color_continuous_scale="Viridis",
    )
    # Place the count labels just past the end of each bar.
    fig.update_traces(textposition="outside")
    fig.update_layout(
        yaxis=dict(autorange="reversed"),  # largest RDC at the top
        coloraxis_showscale=False,         # colour duplicates bar length
        plot_bgcolor="rgba(0,0,0,0)",
        xaxis_title="Number of Research Outputs",
        yaxis_title="Research Data Center (RDC)",
        font=dict(size=14),
    )
    fig.show()
--- EDA: Top 10 RDCs by Output Count (Improved Plotly Version) --- Top 10 RDCs by Number of Research Outputs: ProjectRDC Michigan 342 Boston 323 Chicago 239 Triangle 179 Baruch 178 Atlanta 136 Fed Board 118 Texas 103 Minnesota 100 UCLA 91 Name: count, dtype: int64
Q2.1 Analysis: Top 10 RDCs by Number of Research Outputs¶
After analyzing 2,698 research outputs, we identified the top 10 Research Data Centers (RDCs) based on the number of outputs they produced.
The detailed ranking is as follows:
| Rank | RDC Name | Number of Outputs |
|---|---|---|
| 1 | Michigan | 342 |
| 2 | Boston | 323 |
| 3 | Chicago | 239 |
| 4 | Triangle | 179 |
| 5 | Baruch | 178 |
| 6 | Atlanta | 136 |
| 7 | Fed Board | 118 |
| 8 | Texas | 103 |
| 9 | Minnesota | 100 |
| 10 | UCLA | 91 |
Key observations include:
- Michigan RDC leads with the highest number of research outputs (342), significantly ahead of other centers.
- Boston RDC follows closely with 323 outputs, also demonstrating strong research productivity.
- Chicago RDC ranks third, contributing 239 outputs.
- The top five centers (Michigan, Boston, Chicago, Triangle, and Baruch) each produced over 170 outputs, highlighting their consistent research activity.
- Centers ranked sixth through tenth (Atlanta to UCLA) show a moderate but still substantial level of output, ranging between 90 and 140 publications.
Overall, the Michigan and Boston RDCs stand out for their exceptionally high research productivity, while other centers maintain a steady and meaningful contribution.
The accompanying horizontal bar chart clearly illustrates the output distribution, with Michigan and Boston having noticeably longer bars compared to the rest, reflecting their leadership positions in research contributions.
4. Part1-Q2.2: Publications per Year¶
import plotly.express as px

print("\n--- EDA: Publications per Year (Plotly Version) ---")

if "OutputYear" in df.columns:
    # Coerce years to numeric; unparseable entries become NaN and are dropped.
    # The helper column is kept on df (a later cell recreates it anyway).
    df["OutputYear_numeric"] = pd.to_numeric(df["OutputYear"], errors="coerce")
    yearly_counts = (
        df.dropna(subset=["OutputYear_numeric"])["OutputYear_numeric"]
        .astype(int)
        .value_counts()
        .sort_index()
    )
    print("\nNumber of Publications per Year:")
    print(yearly_counts)

    # Line chart of publication volume over time.
    fig = px.line(
        x=yearly_counts.index,
        y=yearly_counts.values,
        markers=True,
        labels={"x": "Year", "y": "Number of Publications"},
        title="Number of Publications per Year",
        template="plotly_white",
    )
    fig.update_traces(line=dict(color="royalblue", width=3), marker=dict(size=8))
    fig.update_layout(
        xaxis_tickangle=-45,
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(size=14),
    )
    fig.show()
else:
    print("Column 'OutputYear' not found or contains non-numeric data.")
--- EDA: Publications per Year (Plotly Version) --- Number of Publications per Year: OutputYear_numeric 2000 1 2001 5 2002 9 2003 20 2004 31 2005 20 2006 21 2007 37 2008 30 2009 64 2010 84 2011 65 2012 65 2013 69 2014 93 2015 140 2016 151 2017 121 2018 213 2019 196 2020 241 2021 255 2022 259 2023 258 2024 196 2025 54 Name: count, dtype: int64
Q2.2 Analysis: Number of Publications per Year¶
The trend of the number of research outputs from 2000 to 2025 shows several distinct phases:
- 2000–2007: The number of publications increased steadily but remained relatively low, starting from just 1 publication in 2000 and reaching 37 in 2007.
- 2008–2013: Moderate growth was observed during this period, with annual outputs fluctuating between 30 and 93 publications per year.
- 2014–2017: A significant acceleration occurred, as the number of publications grew rapidly from 93 in 2014 to a peak of 151 in 2016.
- 2018–2022: This period marked the highest productivity, with outputs exceeding 190 publications each year and peaking at over 250 publications in 2021 and 2022.
- 2023–2025: A noticeable decline is observed after 2022, with the number of publications dropping to 196 in 2024 and only 54 in 2025.
This sharp decrease in 2025 is likely due to data incompleteness, as 2025 may not have finished at the time of data collection.
Key Insights:¶
- The overall trend indicates strong growth in research output over the last two decades, especially after 2015.
- The peak years (2020–2022) suggest a maturation of research activities and possibly greater utilization of FSRDC resources.
- The recent decline (2024–2025) should be interpreted cautiously, as it may reflect incomplete reporting rather than an actual drop in research productivity.
The line chart clearly visualizes these phases, highlighting both the rapid expansion phase and the potential data artifact in the most recent years.
5. Part1-Q2.3: Top 10 Most Prolific Authors¶
import plotly.express as px
from collections import Counter
import ast
print("\n--- EDA: Top 10 Most Prolific Authors (Plotly Version) ---")
# Helper function to extract authors
def get_all_authors_from_row(row):
authors = set()
if pd.notna(row["Authors"]):
try:
authors_data = row["Authors"]
if isinstance(authors_data, str) and authors_data.startswith("["):
try:
author_list = ast.literal_eval(authors_data)
if isinstance(author_list, list):
authors.update([str(name).strip() for name in author_list if str(name).strip()])
else:
authors.update([name.strip() for name in authors_data.split(";") if name.strip()])
except (ValueError, SyntaxError):
authors.update([name.strip() for name in authors_data.split(";") if name.strip()])
elif isinstance(authors_data, str):
authors.update([name.strip() for name in authors_data.split(";") if name.strip()])
elif isinstance(authors_data, list):
authors.update([str(name).strip() for name in authors_data if str(name).strip()])
except Exception:
pass
return list(authors)
# Aggregate publication counts per author across every row.
author_counts = Counter()
for _, record in df.iterrows():
    author_counts.update(get_all_authors_from_row(record))

# Get the top 10 most prolific authors
top_10_authors = author_counts.most_common(10)
print("\nTop 10 Most Prolific Authors:")
for name, n_pubs in top_10_authors:
    print(f"- {name}: {n_pubs} publications")

# Plotly visualization
if top_10_authors:
    ranking = pd.DataFrame(top_10_authors, columns=["Author", "Count"])
    fig = px.bar(
        ranking,
        x="Count",
        y="Author",
        orientation="h",
        color="Count",
        color_continuous_scale="Magma",  # colourful gradient
        labels={"Count": "Number of Publications", "Author": "Author"},
        title="Top 10 Most Prolific Authors",
        template="plotly_white",
    )
    # Show the exact count just past the end of each bar.
    fig.update_traces(texttemplate="%{x}", textposition="outside")
    fig.update_layout(
        yaxis=dict(autorange="reversed"),  # most prolific author on top
        font=dict(size=14),
        plot_bgcolor="rgba(0,0,0,0)",
        coloraxis_showscale=False,
    )
    fig.show()
--- EDA: Top 10 Most Prolific Authors (Plotly Version) --- Top 10 Most Prolific Authors: - John Haltiwanger: 74 publications - Nathan Goldschlag: 55 publications - Lucia Foster: 52 publications - Nikolas Zolas: 44 publications - Javier Miranda: 44 publications - Scott H. Holan: 43 publications - Emin Dinlersoz: 39 publications - John V. Winters: 37 publications - Jerome P. Reiter: 36 publications - Katie R. Genadek: 33 publications
Q2.3 Analysis: Top 10 Most Prolific Authors¶
Based on the publication data, the top 10 most prolific authors are identified as follows:
| Rank | Author | Number of Publications |
|---|---|---|
| 1 | John Haltiwanger | 74 |
| 2 | Nathan Goldschlag | 55 |
| 3 | Lucia Foster | 52 |
| 4 | Nikolas Zolas | 44 |
| 5 | Javier Miranda | 44 |
| 6 | Scott H. Holan | 43 |
| 7 | Emin Dinlersoz | 39 |
| 8 | John V. Winters | 37 |
| 9 | Jerome P. Reiter | 36 |
| 10 | Katie R. Genadek | 33 |
Key Observations:¶
- John Haltiwanger is the most prolific author, contributing to 74 research outputs, significantly ahead of the second-ranked author.
- Nathan Goldschlag and Lucia Foster also demonstrate strong research activity, with 55 and 52 publications respectively.
- Several authors, including Nikolas Zolas and Javier Miranda, are tied closely with 44 publications each.
- The overall distribution shows a relatively gradual decrease in the number of publications from the top author down to the tenth, indicating a strong and active group of researchers leading FSRDC-related work.
The horizontal bar chart clearly visualizes the dominance of the top contributors, with John Haltiwanger standing out significantly compared to his peers.
6. Part1-Q2.4: Citation Insights¶
- Fetch citation counts from OpenAlex using DOIs (citation counts were not part of the final enriched dataset; we enrich again for this step only)
- refer to fetch_citations_by_title.py
Load the Citation-Enriched Data¶
# --- Load Citation Enriched Data ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
sns.set_theme(style="whitegrid")

# Output of fetch_citations_by_title.py: the filtered 2,698 records plus an
# added 'OutputCitationCount' column (fetched from OpenAlex).
CITATION_DATA_PATH = "EDA_2698_with_citations.csv"

# Load the citation-enriched file; all Q2.4 analyses below depend on it.
try:
    df_citations = pd.read_csv(CITATION_DATA_PATH)
    print(f"Successfully loaded data with citations from {CITATION_DATA_PATH}")
    print(f"Shape: {df_citations.shape}")
    # Use this DataFrame for citation analysis.
    # NOTE: this rebinds `df`, replacing the earlier filtered DataFrame.
    df = df_citations.copy()  # Make a copy to work with
except FileNotFoundError:
    print(
        f"Error: File not found at {CITATION_DATA_PATH}. Please ensure fetch_citations_by_title.py ran successfully."
    )
    # Stop execution or handle error appropriately in the notebook
    exit()
except Exception as e:
    print(f"Error loading citation data: {e}")
    exit()

# Display info to confirm the new column
print("\nData Info (with Citations):")
df.info()
print("\nCitation Column Head:")
print(df[["OutputTitle", "OutputCitationCount"]].head())
Successfully loaded data with citations from EDA_2698_with_citations.csv
Shape: (2698, 34)
Data Info (with Citations):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2698 entries, 0 to 2697
Data columns (total 34 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 index 2698 non-null int64
1 ProjID 2698 non-null int64
2 ProjectStatus 2698 non-null object
3 ProjectTitle 2698 non-null object
4 ProjectRDC 2698 non-null object
5 ProjectYearStarted 2698 non-null int64
6 ProjectYearEnded 2698 non-null int64
7 ProjectPI 2698 non-null object
8 OutputTitle 2698 non-null object
9 OutputBiblio 2698 non-null object
10 OutputType 2698 non-null object
11 OutputStatus 2698 non-null object
12 OutputVenue 2210 non-null object
13 OutputYear 2698 non-null int64
14 OutputMonth 2698 non-null int64
15 OutputVolume 1675 non-null object
16 OutputNumber 1495 non-null object
17 OutputPages 2696 non-null object
18 SourceFile 2698 non-null object
19 DOI 2698 non-null object
20 Keywords 2692 non-null object
21 Authors 2698 non-null object
22 RawAuthorNames 2698 non-null object
23 Affiliations 2545 non-null object
24 RawAffiliations 2545 non-null object
25 IsFSRDCRelated 2698 non-null bool
26 MatchingFSRDCKeywords 2698 non-null object
27 _match_score 2698 non-null int64
28 _metadata_source 2698 non-null object
29 IsEnrichedByMetadata1 2698 non-null bool
30 IsEnrichedByMetadata2 2698 non-null bool
31 IsEnriched 2698 non-null bool
32 IsEnrichedByMetadata3 2698 non-null bool
33 OutputCitationCount 2698 non-null int64
dtypes: bool(5), int64(8), object(21)
memory usage: 624.6+ KB
Citation Column Head:
OutputTitle OutputCitationCount
0 From Selling Goods to Selling Services: Firm R... 37
1 Estimating the effect of a gasoline tax on car... 228
2 Evaluating the Slow Adoption of Energy Efficie... 180
3 Anticipation, Tax Avoidance, and the Price Ela... 125
4 Market Impacts of a Nuclear Power Plant Closure 101
Q2.4.1 Basic Citation Statistics and Distribution¶
# --- Basic Citation Analysis ---
# Descriptive statistics of the OpenAlex citation counts, followed by three
# distribution views: raw-scale histogram, log10-scale histogram, box plot.
print("\n--- EDA: Basic Citation Statistics & Distribution ---")
citation_col = "OutputCitationCount"
# Ensure column exists
if citation_col not in df.columns:
    print(f"Error: Column '{citation_col}' not found.")
else:
    # Convert to numeric, coercing errors. Fill NaN with 0 for some analyses, but keep track of original NaNs.
    df[citation_col + "_numeric"] = pd.to_numeric(df[citation_col], errors="coerce")
    original_nan_count = df[citation_col + "_numeric"].isna().sum()
    valid_citations = df[citation_col + "_numeric"].dropna()
    # NOTE: df_filled_citations is also consumed by the later per-RDC and
    # per-venue cells, so it must be created here even if unused below.
    df_filled_citations = df.fillna(
        {citation_col + "_numeric": 0}
    ) # Create a version with NaNs filled for easier aggregation
    print(f"Number of records analyzed: {len(df)}")
    print(f"Number of records with valid citation counts: {len(valid_citations)}")
    print(f"Number of records with missing citation counts (originally NaN): {original_nan_count}")
    if not valid_citations.empty:
        print("\nDescriptive Statistics for Valid Citation Counts:")
        # Use .describe() on the series with NaNs dropped
        print(valid_citations.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99]))
        # --- Visualization: Distribution (Plotly) ---
        import plotly.express as px
        # --- Prepare citation data (recomputes the earlier steps; redundant
        #     but harmless — kept to make this section self-contained) -------
        citation_col = "OutputCitationCount"
        df[citation_col + "_numeric"] = pd.to_numeric(df[citation_col], errors="coerce")
        valid_citations = df[citation_col + "_numeric"].dropna()
        # ───────────────────────────────────────────────────────────────────────
        # 1) RAW-SCALE HISTOGRAM (px still works fine here)
        # ───────────────────────────────────────────────────────────────────────
        fig_raw = px.histogram(
            valid_citations,
            nbins=50,
            title="Distribution of Citation Counts",
            labels={"value": "Number of Citations", "count": "Number of Publications"},
            template="plotly_white"
        )
        fig_raw.update_traces(marker_color="steelblue", opacity=0.8)
        fig_raw.update_layout(bargap=0.05)
        fig_raw.show()
        # ---------------------------------------------------------------------
        # 2) LOG-SCALE VERSION — WORKAROUND
        #    • transform data: log10(x + 1)  (the +1 keeps zero-citation papers
        #      finite instead of -inf)
        #    • plot on a linear axis
        # ---------------------------------------------------------------------
        log_vals = np.log10(valid_citations + 1)
        fig_log = px.histogram(
            log_vals,
            nbins=50,
            title="Distribution of Citation Counts (log10 scale)",
            labels={
                "value": "log10(Number of Citations + 1)",
                "count": "Number of Publications"
            },
            template="plotly_white"
        )
        fig_log.update_traces(marker_color="indianred", opacity=0.8)
        fig_log.update_layout(bargap=0.05)
        fig_log.show()
        # ───────────────────────────────────────────────────────────────────────
        # 3) BOX PLOT (unchanged px)
        # ───────────────────────────────────────────────────────────────────────
        fig_box = px.box(
            valid_citations,
            title="Box Plot of Citation Counts",
            labels={"value": "Number of Citations"},
            template="plotly_white"
        )
        fig_box.update_traces(marker_color="darkorange", line_color="black",
                              boxpoints="outliers")
        fig_box.show()
    else:
        print("No valid citation data found to analyze.")
--- EDA: Basic Citation Statistics & Distribution --- Number of records analyzed: 2698 Number of records with valid citation counts: 2698 Number of records with missing citation counts (originally NaN): 0 Descriptive Statistics for Valid Citation Counts: count 2698.000000 mean 111.227205 std 358.931144 min 0.000000 25% 5.000000 50% 25.000000 75% 87.750000 90% 242.300000 95% 464.150000 99% 1475.060000 max 10952.000000 Name: OutputCitationCount_numeric, dtype: float64
Q2.4.1 Findings: Citation Patterns of FSRDC Outputs¶
After analyzing citation counts for 2,698 FSRDC-related publications, several key insights emerge:
Overall Skewness
- The distribution is highly right-skewed: although the mean citation count is 111.2, the median is only 25, indicating that most papers receive relatively few citations while a small number achieve very high impact.
Concentration of Low-Cited Works
- 25% of publications have 5 or fewer citations.
- 50% (median) have 25 or fewer citations.
- 75% have 87.75 or fewer citations.
Heavy Tail of High-Cited Papers
- The 90th percentile sits at 242 citations, the 95th percentile at 464, and the 99th percentile at 1,475.
- The maximum citation count is 10,952, highlighting a small elite group of exceptionally influential studies.
Implications
- The majority of FSRDC outputs achieve modest visibility, while a few “blockbuster” papers drive overall citation metrics.
- This long-tail pattern suggests that targeted support for high-potential research (e.g., through collaborative projects or dissemination efforts) may amplify overall impact.
Next Steps
- Investigate characteristics (authors, topics, venues, RDC centers) of top-cited papers to understand drivers of influence.
- Explore strategies to raise the citation profile of mid-range works (e.g., open access, wider dissemination).
These findings underscore that while FSRDC outputs collectively contribute to the literature, a small fraction of publications account for a disproportionate share of citations.
Q2.4.2 Citations Over Time¶
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Prepare Data for Time Series Analysis ---
citation_col = "OutputCitationCount_numeric"
df["OutputYear_numeric"] = pd.to_numeric(df["OutputYear"], errors="coerce")
df_time = df.dropna(subset=["OutputYear_numeric", citation_col]).copy()
df_time["Year"] = df_time["OutputYear_numeric"].astype(int)

# Per-year metrics: central tendency of citations plus publication volume.
yearly = df_time.groupby("Year")[citation_col]
mean_cites = yearly.mean()
median_cites = yearly.median()
pub_counts = df_time["Year"].value_counts().sort_index()

# --- Plotly Figure: Citations Over Time with Dual Y-Axes ---
fig = make_subplots(specs=[[{"secondary_y": True}]])

# One spec per trace: (series, legend name, marker style, line style,
# whether it belongs on the secondary y-axis).
trace_specs = [
    (mean_cites, "Mean Citations",
     dict(symbol="circle", size=8, color="firebrick"),
     dict(color="firebrick", width=2), False),
    (median_cites, "Median Citations",
     dict(symbol="x", size=10, color="darkorange"),
     dict(color="darkorange", width=2, dash="dash"), False),
    (pub_counts, "Publications Count",
     dict(symbol="circle-open", size=6, color="royalblue"),
     dict(color="royalblue", width=2, dash="dot"), True),
]
for series, label, marker_style, line_style, on_secondary in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=series.index,
            y=series.values,
            mode="lines+markers",
            name=label,
            marker=marker_style,
            line=line_style,
        ),
        secondary_y=on_secondary,
    )

# Axis titles: citations on the left, publication volume on the right.
fig.update_xaxes(title_text="Publication Year")
fig.update_yaxes(title_text="Avg / Median Citations", secondary_y=False, color="firebrick")
fig.update_yaxes(title_text="Number of Publications", secondary_y=True, color="royalblue")

# Layout styling
fig.update_layout(
    title_text="Citation Trends and Publication Volume Over Time",
    template="plotly_white",
    legend=dict(x=0.05, y=0.95),
    width=900,
    height=500,
)
fig.show()
Q2.4.2 Analysis: Citations Over Time¶
The time-series analysis of citation counts alongside publication volume reveals these patterns:
Early Years (2000–2004)
- Very few publications in the 2000–2001 period, but those early works accumulate high citations (mean >500, median >180) due to longer exposure time.
Growth Phase (2005–2010)
- Both mean and median citations per year stabilize around 200–400 (mean) and 80–140 (median).
- Publication output rises gradually from under 50 to around 150 outputs per year.
Rapid Expansion (2011–2018)
- Publication volume accelerates sharply—from ~150 in 2011 to over 200 by 2015 and peaking near 260 by 2018.
- Meanwhile, average citations drift downward from ~240 to ~120, and median citations fall below 60, reflecting a dilution effect as more recent papers have had less time to accrue citations.
Maturation & Decline (2019–2025)
- Publication counts remain high (250+ in 2019–2021) before dropping in 2024–2025 (data incomplete for 2025).
- Mean citations continue a gradual decline to under 60 by 2020 and median citations drop to single digits, indicating that newer outputs have not yet built citation momentum.
Overall, the dual‐axis chart highlights how increasing publication volume over time coexists with a decline in per-paper citation averages, driven largely by the citation lag of recent years.
Q2.4.3 Citations by RDC¶
import pandas as pd
import plotly.express as px

# --- Citations by RDC Analysis ---
print("\n--- EDA: Citations by RDC ---")

rdc_col = "ProjectRDC"
citation_col = "OutputCitationCount_numeric"
min_pubs_threshold = 5
top_n = 15

# df_filled_citations (citation NaNs filled with 0) was created by the
# basic-statistics cell above; drop rows with no RDC attribution.
df_rdc_analysis = df_filled_citations.dropna(subset=[rdc_col])

# Per-RDC citation metrics plus output volume, ranked by median impact.
grouped = df_rdc_analysis.groupby(rdc_col)[citation_col]
rdc_citation_summary = pd.DataFrame({
    "MeanCitations": grouped.mean(),
    "MedianCitations": grouped.median(),
    "PublicationCount": df_rdc_analysis[rdc_col].value_counts(),
}).sort_values("MedianCitations", ascending=False)

print("\nCitation Summary by RDC (Sorted by Median Citations):")
print(rdc_citation_summary.head(top_n))

# For the chart, keep only RDCs with a meaningful sample size, then take
# the top N by median citations.
rdc_plot = rdc_citation_summary[
    rdc_citation_summary["PublicationCount"] >= min_pubs_threshold
].head(top_n)

# --- Plotly Visualization: Top RDCs by Median Citations ---
fig = px.bar(
    rdc_plot,
    x="MedianCitations",
    y=rdc_plot.index,
    orientation="h",
    color="MedianCitations",
    color_continuous_scale="Viridis",
    labels={
        "y": "RDC",
        "MedianCitations": "Median Citations per Publication"
    },
    title=f"Top {top_n} RDCs by Median Citations (Min {min_pubs_threshold} Publications)",
    template="plotly_white",
    text="MedianCitations"
)
# Style tweaks: labels outside the bars, best RDC at the top.
fig.update_traces(textposition="outside")
fig.update_layout(
    yaxis=dict(autorange="reversed"),
    coloraxis_showscale=False,
    xaxis_title="Median Citations per Publication",
    width=800,
    height=600
)
fig.show()
--- EDA: Citations by RDC ---
Citation Summary by RDC (Sorted by Median Citations):
MeanCitations MedianCitations PublicationCount
ProjectRDC
Washington 243.232877 103.0 73
USC 128.633333 57.5 30
Triangle 149.049724 49.0 181
CensusHQ 47.000000 47.0 1
Berkeley 207.079365 46.0 63
Nebraska 53.600000 45.0 5
UCLA 166.318681 43.0 91
Boston 151.743119 42.0 327
Chicago 107.607759 37.5 232
Stanford 83.786885 34.0 61
Austin 177.000000 32.5 16
Texas 304.000000 32.0 101
Baruch 97.883978 31.0 181
Michigan 95.120588 28.0 340
Wisconsin 86.960000 24.5 50
Q2.4.3 Analysis: Citation Impact by RDC¶
We computed mean and median citation counts per Research Data Center (RDC), then filtered to centers with at least 5 publications and ranked by median citations. The top 15 RDCs are:
| Rank | RDC | Median Citations | Mean Citations | Publication Count |
|---|---|---|---|---|
| 1 | Washington | 103.0 | 243.23 | 73 |
| 2 | USC | 57.5 | 128.63 | 30 |
| 3 | Triangle | 49.0 | 149.05 | 181 |
| 4 | Berkeley | 46.0 | 207.08 | 63 |
| 5 | Nebraska | 45.0 | 53.60 | 5 |
| 6 | UCLA | 43.0 | 166.32 | 91 |
| 7 | Boston | 42.0 | 151.74 | 327 |
| 8 | Chicago | 37.5 | 107.61 | 232 |
| 9 | Stanford | 34.0 | 83.79 | 61 |
| 10 | Austin | 32.5 | 177.00 | 16 |
| 11 | Texas | 32.0 | 304.00 | 101 |
| 12 | Baruch | 31.0 | 97.88 | 181 |
| 13 | Michigan | 28.0 | 95.12 | 340 |
| 14 | Wisconsin | 24.5 | 86.96 | 50 |
| 15 | Kansas City | 24.0 | 104.50 | 23 |
Only RDCs with ≥5 publications are shown.
Key Takeaways¶
- Washington RDC leads by a wide margin (median 103 citations), suggesting consistently high-impact publications.
- USC and Triangle follow, with medians of ~57 and ~49 respectively, indicating strong citation performance relative to their output volume.
- A handful of centers (e.g., Nebraska, Austin) have very small publication counts (just above the threshold) but still achieve high medians—this may reflect a few highly cited papers skewing the median.
- Larger centers by volume (e.g., Michigan, Boston, Chicago) show respectable but lower median citations (28–42), reflecting a broader mix of high- and mid-cited works.
Interpretation¶
- Median citations per center provide a robust measure of typical impact, less sensitive to extreme outliers than the mean.
- Washington’s clear lead suggests a concentration of influential research, whereas high‐volume centers exhibit a wider range of citation performance.
- Future work could examine topics, authorship, or collaboration patterns behind the highest‐median centers to uncover best practices for maximizing research visibility.
Q2.4.4 Citations by Publication Venue¶
import pandas as pd
import plotly.express as px

# --- Citations by Publication Venue ---
# Ranks publication venues by median citations per paper, restricted to
# venues with a minimum number of publications, and plots the top N.
print("\n--- EDA: Citations by Publication Venue ---")
venue_col = "OutputVenue"
citation_col = "OutputCitationCount_numeric"
min_pubs_threshold_venue = 3
top_n_venue = 15

# df_filled_citations is built in an earlier cell; drop rows without a venue
df_venue_analysis = df_filled_citations.dropna(subset=[venue_col])

# Per-venue aggregates: mean/median citations plus publication counts
grouped = df_venue_analysis.groupby(venue_col)[citation_col]
venue_summary = pd.DataFrame({
    "MeanCitations": grouped.mean(),
    "MedianCitations": grouped.median(),
    "PublicationCount": df_venue_analysis[venue_col].value_counts()
}).sort_values("MedianCitations", ascending=False)

# Apply the minimum-publication filter BEFORE printing, so the printed table
# matches its caption. (Bug fix: the original printed the unfiltered summary,
# so venues with a single publication appeared under a "Min 3" heading.)
venue_filtered = venue_summary[
    venue_summary["PublicationCount"] >= min_pubs_threshold_venue
].head(top_n_venue)

print(f"\nTop Venues by Median Citations (Min {min_pubs_threshold_venue} Publications):")
print(venue_filtered)

# --- Plotly Visualization with one-decimal formatting ---
fig = px.bar(
    venue_filtered,
    x="MedianCitations",
    y=venue_filtered.index,
    orientation="h",
    color="MedianCitations",
    color_continuous_scale="Magma",
    title=f"Top {top_n_venue} Venues by Median Citations (Min {min_pubs_threshold_venue} Publications)",
    labels={"y": "Venue", "MedianCitations": "Median Citations per Publication"},
    text="MedianCitations",
    template="plotly_white"
)
# Force one decimal place in the on-bar labels
fig.update_traces(texttemplate="%{x:.1f}", textposition="outside")
fig.update_layout(
    yaxis=dict(autorange="reversed"),
    coloraxis_showscale=False,
    xaxis_title="Median Citations per Publication",
    width=900,
    height=600
)
fig.show()
--- EDA: Citations by Publication Venue ---
Top Venues by Median Citations (Min 3 Publications):
MeanCitations MedianCitations \
OutputVenue
Nature Methods 10952.0 10952.0
Journal of Oral & Facial Pain and Headache 3519.0 3519.0
IEEE Communications Magazine 3394.0 3394.0
Circulation 2496.0 2496.0
Rhinology Journal 2309.0 2309.0
CA A Cancer Journal for Clinicians 1963.5 1963.5
Nature Genetics 1435.5 1435.5
BMJ 1181.0 1181.0
Chapman and Hall/CRC eBooks 885.0 885.0
IEEE Transactions on Image Processing 738.5 738.5
The Permanente Journal 719.0 719.0
Nature Medicine 679.0 679.0
Environmental Health Perspectives 640.0 640.0
Nature 633.0 633.0
ACS Nano 627.0 627.0
PublicationCount
OutputVenue
Nature Methods 1
Journal of Oral & Facial Pain and Headache 1
IEEE Communications Magazine 1
Circulation 1
Rhinology Journal 1
CA A Cancer Journal for Clinicians 2
Nature Genetics 2
BMJ 1
Chapman and Hall/CRC eBooks 1
IEEE Transactions on Image Processing 2
The Permanente Journal 1
Nature Medicine 1
Environmental Health Perspectives 1
Nature 1
ACS Nano 1
Q2.4.4 Analysis: Citation Impact by Publication Venue¶
After filtering to venues with at least 3 publications and ranking by median citations, the top 15 venues are:
| Rank | Venue | Median Citations | Mean Citations | Publication Count |
|---|---|---|---|---|
| 1 | British Journal of Pharmacology | 256.0 | 277.5 | 4 |
| 2 | The Quarterly Journal of Economics | 248.0 | 387.9 | 9 |
| 3 | Applied Geography | 248.0 | 183.0 | 3 |
| 4 | The Journal of Finance | 244.0 | 392.1 | 9 |
| 5 | IEEE Communications Letters | 233.0 | 286.7 | 3 |
| 6 | Journal of Political Economy | 232.0 | 348.2 | 13 |
| 7 | Brookings Papers on Economic Activity | 222.0 | 209.0 | 7 |
| 8 | Journal of Econometrics | 215.0 | 1186.3 | 4 |
| 9 | The Accounting Review | 208.0 | 208.0 | 4 |
| 10 | Review of Financial Studies | 204.0 | 287.1 | 13 |
| 11 | 2021 IEEE/CVF International Conference on Computer Vision (ICCV) | 182.0 | 490.3 | 4 |
| 12 | The Astrophysical Journal Supplement Series | 182.0 | 257.3 | 3 |
| 13 | The Journal of Economic Perspectives | 175.0 | 248.5 | 17 |
| 14 | Journal of Financial Economics | 170.5 | 327.5 | 27 |
| 15 | Journal of Economic Literature | 157.5 | 547.1 | 15 |
Only venues with ≥3 publications are shown.
Key Insights¶
- Top Scholarly Outlets
- Top Scholarly Outlets: Biomedical and economics journals (e.g., British Journal of Pharmacology, QJE, The Journal of Finance) dominate the top spots, indicating high typical impact per paper.
- Interdisciplinary Influence: Specialized venues such as ICCV and The Astrophysical Journal Supplement Series also rank highly, revealing FSRDC’s cross-domain research reach.
- Median Robustness: Using the median ensures that extreme outliers (e.g., very highly cited individual papers) do not disproportionately affect the ranking.
- Strategic Guidance: Researchers may boost visibility by targeting these high-median venues, while also balancing publication count to maximize overall influence.
Q2.4.5 Top 20 Highly Cited Papers¶
import plotly.express as px

# --- Prepare Top 20 Highly Cited Papers ---
# Candidate columns for display; silently skip any missing from df.
cols_to_show = [
    "OutputTitle",
    "OutputCitationCount_numeric",
    "Authors",
    "OutputYear",
    "OutputVenue",
    "ProjectRDC",
    "DOI",
]
cols_to_show = [col for col in cols_to_show if col in df.columns]

# Twenty most-cited rows, keeping only the display columns.
ranked = df.sort_values("OutputCitationCount_numeric", ascending=False)
top_cited = ranked.head(20)[cols_to_show].copy()

# Truncate titles so the y-axis labels stay readable.
top_cited["ShortTitle"] = top_cited["OutputTitle"].str.slice(0, 60) + "..."

# --- Plotly Bar Chart ---
fig = px.bar(
    top_cited,
    x="OutputCitationCount_numeric",
    y="ShortTitle",
    orientation="h",
    color="OutputYear",
    color_continuous_scale="Turbo",
    labels={
        "OutputCitationCount_numeric": "Citation Count",
        "ShortTitle": "Paper Title",
        "OutputYear": "Year"
    },
    title="Top 20 Highly Cited FSRDC Publications",
    hover_data=["Authors", "OutputYear", "OutputVenue", "ProjectRDC", "DOI"],
    template="plotly_white"
)
fig.update_traces(texttemplate="%{x}", textposition="outside")
fig.update_layout(
    yaxis=dict(autorange="reversed"),
    xaxis_title="Number of Citations",
    height=800,
    margin=dict(l=300, r=50, t=100, b=50)
)
fig.show()
Q2.4.5 Analysis: Highly Cited FSRDC Publications¶
The table below lists the top 20 FSRDC‐related publications by citation count:
| Rank | Title (shortened) | Citations | Year | Venue | RDC |
|---|---|---|---|---|---|
| 1 | SciPy 1.0: fundamental algorithms for scientific computing i… | 10,952 | 2020 | Nature Methods | Texas |
| 2 | The Impact of Uncertainty Shocks… | 4,818 | 2009 | Econometrica | Berkeley |
| 3 | Difference-in-differences with variation in treatment timing… | 4,271 | 2021 | Journal of Econometrics | Texas |
| 4 | Cancer treatment and survivorship statistics, 2019… | 3,853 | 2019 | CA: A Cancer Journal for Clinicians | Triangle |
| 5 | Intelligent Reflecting Surface Enhanced Wireless Network via… | 3,635 | 2019 | IEEE Transactions on Wireless Communications | Texas |
| 6 | Diagnostic Criteria for Temporomandibular Disorders (DC/TMD)… | 3,519 | 2014 | Journal of Oral & Facial Pain and Headache | UCLA |
| 7 | Towards Smart and Reconfigurable Environment: Intelligent Re… | 3,394 | 2019 | IEEE Communications Magazine | Texas |
| 8 | What Determines Productivity? | 2,500 | 2011 | Journal of Economic Literature | Chicago |
| 9 | Secular Trends in Incidence of Atrial Fibrillation in Olmste… | 2,496 | 2006 | Circulation | Washington |
| 10 | Gene discovery and polygenic prediction from a genome-wide a… | 2,408 | 2018 | Nature Genetics | Penn State |
| 11 | EPOS 2012: European position paper on rhinosinusitis and nas… | 2,309 | 2012 | Rhinology Journal | Boston |
| 12 | The Gender Wage Gap: Extent, Trends, and Explanations… | 2,297 | 2017 | Journal of Economic Literature | Minnesota |
| 13 | Generative Image Inpainting with Contextual Attention… | 2,281 | 2018 | (missing venue) | Austin |
| 14 | “Weathering” and Age Patterns of Allostatic Load Scores Amon… | 2,142 | 2005 | American Journal of Public Health | Michigan |
| 15 | The real effects of financial constraints: Evidence from a f… | 2,057 | 2010 | Journal of Financial Economics | Triangle |
| 16 | The Effects of Exposure to Better Neighborhoods on Children:… | 1,855 | 2016 | American Economic Review | Atlanta |
| 17 | Reallocation, Firm Turnover, and Efficiency: Selection on Pr… | 1,816 | 2008 | American Economic Review | Chicago |
| 18 | Private Equity Performance: Returns, Persistence, and Capita… | 1,782 | 2005 | The Journal of Finance | Boston |
| 19 | The Consequences of Mortgage Credit Expansion:… | 1,743 | 2009 | The Quarterly Journal of Economics | Baruch |
| 20 | Ferreting out Tunneling: An Application to Ind… | 1,665 | 2002 | The Quarterly Journal of Economics | Boston |
Key Insights¶
Extreme Leaders
- SciPy 1.0 (Nature Methods, 2020) is an outlier with over 10,900 citations, reflecting its foundational role in Python-based scientific computing.
Top Economics Papers
- Seminal works in economics (“Impact of Uncertainty Shocks,” “What Determines Productivity?,” “The Gender Wage Gap,” and multiple QJE papers) dominate the high‐citation list, underscoring FSRDC’s importance for macro- and micro-economic research.
Interdisciplinary Reach
- High‐impact publications span diverse fields: medical (CA: Cancer Journal, Circulation), engineering (IEEE Wireless, IEEE Communications), genetics (Nature Genetics), and computer science (Generative Image Inpainting).
Temporal Spread
- Citation leaders range from 2002 through 2021. Classic studies (early 2000s) and recent methodological contributions (e.g., SciPy 1.0, diff-in-diff methodology) both achieve top impact.
RDC Contributions
- Texas, Boston, Chicago, and Triangle RDCs repeatedly appear, reflecting both high‐volume and high‐impact research pipelines.
Part1-Q2.5 Other Insights¶
Q2.5.1 Stacked Area Chart of Publication Types Over Time¶
import pandas as pd
import plotly.express as px

# Counts of outputs per (year, output type) for the stacked-area chart.
# Bug fix: the original ran .astype(int) on OutputYear_numeric BEFORE
# dropping NaNs, which raises (cannot convert NaN to int) whenever the
# year column has missing values. Coerce to the nullable Int64 dtype
# instead, then drop the missing rows.
df["Year"] = pd.to_numeric(df["OutputYear_numeric"], errors="coerce").astype("Int64")
type_year = (
    df.dropna(subset=["Year", "OutputType"])
    .groupby(["Year", "OutputType"])
    .size()
    .reset_index(name="Count")
)
fig = px.area(
    type_year,
    x="Year", y="Count", color="OutputType",
    title="Publication Types Over Time",
    labels={"Count":"Number of Publications"},
    template="plotly_white"
)
fig.update_layout(legend_title_text="Output Type")
fig.show()
Q2.5.1 Analysis¶
The area chart shows how different output types (MI = journal articles, BC = book chapters, RE = reports, DI = dissertations, JA = working papers, DS = data sets) evolve from 2000 to 2025:
- Dominance of Journal Articles (MI): MI comprises nearly all publications in every year, growing from virtually zero in 2000 to over 250 outputs by 2021–2022.
- Emergence of Reports (RE): RE begins to appear around 2005 and steadily increases, reaching a noticeable share (~20–30 papers/year) by 2018–2022.
- Minor Output Types: BC, DI, JA, and DS remain very small (<5–10 papers/year), with no major surges.
- Implication: The FSRDC program is overwhelmingly focused on journal article production, but the gradual rise of reports suggests diversification in recent years.
Q2.5.2 Box Plot of Citation Distributions for Top 10 RDCs¶
import plotly.express as px

# Ten RDCs with the largest publication volume, then the matching rows.
busiest_rdcs = df["ProjectRDC"].value_counts().head(10).index
df_top_rdc = df[df["ProjectRDC"].isin(busiest_rdcs)]

# Box plot of citation counts per RDC.
fig = px.box(
    df_top_rdc,
    x="ProjectRDC",
    y="OutputCitationCount_numeric",
    title="Citation Distribution by Top 10 RDCs",
    labels={"OutputCitationCount_numeric": "Citations", "ProjectRDC": "RDC"},
    template="plotly_white"
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()
Q2.5.2 Analysis¶
The box plots compare citation counts for the ten RDCs with the highest publication volumes (Michigan, Boston, Fed Board, UCLA, Atlanta, Triangle, Chicago, Baruch, Texas, Minnesota):
- Wide Citation Range: All ten centers show long “tails” of highly cited papers, with most outputs clustered at lower citation counts.
- Texas Outlier: Texas’s distribution includes an extreme outlier above 10,000 citations (the SciPy 1.0 paper), pulling its mean upward.
- Median Differences: Boston and Michigan have higher medians (~25–30 citations) compared to smaller centers like Fed Board and Minnesota (medians below 20).
- Consistency: Centers such as Chicago, Triangle, and UCLA exhibit moderately tight interquartile ranges, suggesting more consistent citation impact across their publications.
- Implication: While output volume is important, citation performance varies widely—some centers produce consistently citable work, while others rely on a few blockbuster papers.
Q2.5.3 Scatter Plot: Citations vs. Publication Year by Output Type¶
import plotly.express as px

# Keep only rows where year, citation count, and output type are all present.
required = ["OutputYear_numeric", "OutputCitationCount_numeric", "OutputType"]
df_scatter = df.dropna(subset=required)

# One point per paper: citations vs. year, colored by output type.
fig = px.scatter(
    df_scatter,
    x="OutputYear_numeric",
    y="OutputCitationCount_numeric",
    color="OutputType",
    title="Citations vs. Publication Year by Output Type",
    labels={"OutputYear_numeric": "Year", "OutputCitationCount_numeric": "Citations"},
    hover_data=["OutputTitle", "ProjectRDC", "OutputVenue"],
    template="plotly_white"
)
fig.update_traces(marker=dict(size=6, opacity=0.7))
fig.show()
Q2.5.3 Analysis¶
The scatter displays each paper’s citation count against its publication year, colored by output type:
- Journal Articles (MI) Lead: MI points (blue) dominate the upper citation range, including several thousand‐citation outliers across all years.
- Low‐Citation Clusters: RE, BC, DI, JA, and DS outputs mostly lie below 500 citations, with few exceptions.
- Temporal Trend: Older papers (pre‐2010) tend to have higher citation counts, while recent outputs (post‐2018) cluster near lower counts due to less time for accrual.
- Implication: Journal articles remain the primary driver of high citation impact; other output types achieve fewer citations and may need targeted dissemination strategies.
Q2.5.4 Citation Velocity by Publication Year and RDC¶
import pandas as pd
import plotly.express as px

# 2. Citation Velocity vs. Publication Year
# ------------------------------------------
# Requires df["OutputYear_numeric"] and df["OutputCitationCount_numeric"].

# Paper age in years since publication, counting the publication year itself;
# clip guards against zero/negative ages (e.g. future-dated entries).
CURRENT_YEAR = 2025
df["_Age"] = (CURRENT_YEAR - df["OutputYear_numeric"] + 1).clip(lower=1)

# Citation velocity: lifetime citations averaged over the paper's age.
df["CitationVelocity"] = df["OutputCitationCount_numeric"] / df["_Age"]

# Scatter: each point is a paper, colored by RDC.
fig = px.scatter(
    df,
    x="OutputYear_numeric",
    y="CitationVelocity",
    color="ProjectRDC",
    hover_data={
        "OutputTitle": True,
        "OutputCitationCount_numeric": True,
        "CitationVelocity": ":.1f",
        "OutputYear_numeric": True
    },
    title="Citation Velocity by Publication Year and RDC",
    labels={
        "OutputYear_numeric": "Publication Year",
        "CitationVelocity": "Citations per Year"
    },
    template="plotly_white"
)
fig.update_traces(marker=dict(size=6, opacity=0.7))
# Cap the y-axis near the 99th percentile so extreme outliers do not
# flatten the bulk of the distribution.
velocity_cap = df["CitationVelocity"].quantile(0.99) * 1.1
fig.update_layout(
    xaxis=dict(dtick=1),
    yaxis=dict(range=[0, velocity_cap]),
    height=500,
    width=800
)
fig.show()
Q2.5.4 Analysis¶
This scatter plots “citation velocity” (citations divided by years since publication) versus publication year, colored by RDC:
- Fast Risers: A handful of papers (e.g., 2020 SciPy 1.0) exceed 100 citations per year, indicating rapid influence soon after release.
- RDC Variation: Texas, Boston, and Chicago RDCs show multiple high‐velocity points, suggesting strong early impact from their recent work.
- General Decline Over Time: Most papers cluster between 0–30 citations/year, with velocity decreasing for older publications as yearly citations accrue more slowly.
- Implication: Citation velocity highlights which new papers are “hot” and may identify emerging research trends or methodologies that quickly gain traction.
Q2.5.5 Correlation Heatmap of Numeric Fields¶
import plotly.express as px

# Build a numeric-only DataFrame for the correlation matrix.
# Bug fix: OutputMonth can contain missing values (see the dataset info),
# and .astype(int) raises on NaN. Coerce it with pd.to_numeric like the
# other mixed-type columns; NaNs are then ignored pairwise by .corr().
num_df = pd.DataFrame({
    "Citations": df["OutputCitationCount_numeric"],
    "Year": df["OutputYear_numeric"],
    "Month": pd.to_numeric(df["OutputMonth"], errors="coerce"),
    "Volume": pd.to_numeric(df["OutputVolume"], errors="coerce"),
    "Issue": pd.to_numeric(df["OutputNumber"], errors="coerce"),
    "Pages": pd.to_numeric(df["OutputPages"], errors="coerce")
})
corr = num_df.corr()
fig = px.imshow(
    corr,
    text_auto=True,
    title="Correlation Matrix of Numeric Fields",
    labels={"x":"","y":""},
    template="plotly_white",
    color_continuous_scale="RdBu_r"
)
fig.show()
Q2.5.5 Analysis¶
The heatmap shows Pearson correlations among numeric features: citation count, publication year, month, volume, issue number, and pages:
- Citations vs. Year: A moderate negative correlation (~–0.21) confirms that newer papers have fewer citations simply due to less time in circulation.
- Volume & Issue: These two fields correlate modestly (0.17), suggesting that larger volume/issue numbers tend to group together in some journals.
- Citations vs. Other Fields: Very weak correlations (|r|<0.04) with month, volume, issue, or pages indicate that bibliographic metadata alone does not predict citation impact.
- Implication: The dominant driver of citation count is time since publication; other numeric metadata are not strong predictors, pointing toward content and venue as more important factors.
Part1-Q2:EDA Summary¶
We performed a comprehensive exploratory analysis of 2,698 research outputs from FSRDC centers. Below is a high-level synthesis of the main findings:
Publication Volume & Growth
- Total outputs rose from almost zero in 2000 to a peak of ~260 publications per year by 2021–2022, before declining in 2024–2025 (partial data).
- The top 10 RDCs by sheer volume are Michigan, Boston, Chicago, Triangle, Baruch, Atlanta, Fed Board, Texas, Minnesota, and UCLA, together accounting for the majority of outputs.
Author Productivity
- Among all authors, John Haltiwanger leads with 74 publications, followed by Nathan Goldschlag (55), Lucia Foster (52), and several others in the 30–45 range.
- A histogram of author counts shows most papers are single- or small-team efforts, with few large collaborations.
Output Types & Diversification
- Journal articles (MI) dominate the portfolio (>90% of outputs), but the share of reports (RE) has steadily grown since 2005.
- Other categories—book chapters, dissertations, working papers, data sets—remain niche (<5–10 papers/year).
Citation Distribution & Skew
- Citation counts are extremely right-skewed: median = 25, mean ≈111, 99th percentile ≈1,475, maximum = 10,952.
- Over 75% of papers have ≤88 citations, and a handful of “blockbusters” drive the heavy tail.
Citations Over Time
- Early publications (pre-2010) enjoy higher mean/median citations, while recent years show lower per‐paper citations due to citation lag.
- Publication volume and per‐paper citations are inversely related over time: as output volume soared, mean/median citations per paper declined.
RDC-Level Impact
- Washington RDC achieves the highest median citations per paper (103), followed by USC (57.5) and Triangle (49).
- High-volume centers like Michigan and Boston show lower medians (28–42), reflecting broader citation variability.
Venue-Level Impact
- Top venues by median citation include British Journal of Pharmacology (256), QJE (248), Applied Geography (248), and Journal of Finance (244).
- This list spans biomedical, economics, engineering, and astrophysics outlets, underscoring interdisciplinary reach.
Citation Velocity
- When adjusted for age, certain recent works (e.g., the SciPy 1.0 paper) exhibit exceptionally high citations per year (>100/year), marking them as “fast risers.”
Inter-Variable Correlations
- The only notable numeric correlation is a negative link between citations and publication year (r≈–0.21), reflecting time-to-accumulate effects.
- Other metadata fields (month, volume, issue, pages) show negligible correlations with citation impact.
Conclusion:
FSRDC outputs have grown dramatically over 25 years, concentrated in journal articles and a handful of prolific authors and centers. Citation impact follows a classic long-tail pattern: most works receive modest attention, while a small subset achieves extraordinary influence. Venue choice, publication timing, and citation velocity reveal strategic levers for maximizing research visibility.
Part2-Data Science Applications¶
Step1 Data Preprocessing¶
import pandas as pd
import numpy as np

# Step 1: Load datasets
eda_df = pd.read_excel("EDA_2698_with_citations_processed.xlsx")
research_df = pd.read_excel("ResearchOutputs.xlsx")
print(f"Rows in EDA dataset: {eda_df.shape[0]}")
print(f"Rows in ResearchOutputs dataset: {research_df.shape[0]}")

# Step 2: Rename research_df columns to match eda_df
# (identity entries kept so the mapping doubles as the canonical column list)
column_mapping = {
    "ProjectID": "ProjID",
    "ProjectStatus": "ProjectStatus",
    "ProjectTitle": "ProjectTitle",
    "ProjectRDC": "ProjectRDC",
    "ProjectStartYear": "ProjectYearStarted",
    "ProjectEndYear": "ProjectYearEnded",
    "ProjectPI": "ProjectPI",
    "OutputTitle": "OutputTitle",
    "OutputBiblio": "OutputBiblio",
    "OutputType": "OutputType",
    "OutputStatus": "OutputStatus",
    "OutputVenue": "OutputVenue",
    "OutputYear": "OutputYear",
    "OutputMonth": "OutputMonth",
    "OutputVolume": "OutputVolume",
    "OutputNumber": "OutputNumber",
    "OutputPages": "OutputPages",
}
research_df = research_df.rename(columns=column_mapping)

# Step 3: Define target columns — the mapping's values ARE the core schema
core_columns = list(column_mapping.values())
additional_columns = [
    "DOI", "Keywords", "Authors", "RawAuthorNames",
    "Affiliations", "RawAffiliations", "MatchingFSRDCKeywords", "OutputCitationCount"
]
final_columns = core_columns + additional_columns

# Step 4: Drop extra columns in eda_df
eda_df = eda_df[final_columns]

# Step 5: Add any enrichment columns missing from research_df
for col in additional_columns:
    if col not in research_df.columns:
        research_df[col] = np.nan

# Step 6: Align column order
research_df = research_df[final_columns]

# Step 7: Convert appropriate columns to nullable Int64 (rounding first so
# floats like 2015.0 survive the cast)
int_columns = ["ProjID", "ProjectYearStarted", "ProjectYearEnded", "OutputYear", "OutputMonth", "OutputNumber", "OutputCitationCount"]
for col in int_columns:
    for frame in (eda_df, research_df):
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce').round().astype("Int64")

# Step 8: Concatenate the two aligned datasets
combined_df = pd.concat([eda_df, research_df], ignore_index=True)
print(f"Rows after merging: {combined_df.shape[0]}")

# Step 9: Save
combined_df.to_csv("Combined_ResearchOutputs_Final.csv", index=False)
print("Combined dataset saved as 'Combined_ResearchOutputs_Final.csv'.")
Rows in EDA dataset: 2698 Rows in ResearchOutputs dataset: 1735 Rows after merging: 4433 Combined dataset saved as 'Combined_ResearchOutputs_Final.csv'.
Step2 Data Statistics¶
# --- Basic EDA on Combined_ResearchOutputs_Final.csv ---
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load the data
data_path = "Combined_ResearchOutputs_Final.csv"
df = pd.read_csv(data_path)
print(f"Loaded data: {df.shape[0]} rows, {df.shape[1]} columns\n")

# --- 1. Missing Value Check ---
# Only columns that actually have missing values, largest first.
null_counts = df.isnull().sum()
missing_values = null_counts[null_counts > 0].sort_values(ascending=False)
fig_missing = px.bar(
    x=missing_values.index,
    y=missing_values.values,
    labels={"x": "Columns", "y": "Number of Missing Values"},
    title="Missing Values per Column",
    template="plotly_white"
)
fig_missing.update_layout(xaxis_tickangle=45)
fig_missing.show()

# --- 2. Numeric Columns Statistics ---
numeric_cols = df.select_dtypes(include=["int64", "float64", "Int64"]).columns
df_numeric_stats = df[numeric_cols].describe().transpose()
print("Basic statistics for numeric columns:")
print(df_numeric_stats)

# --- 3. Top 10 RDCs by Number of Projects ---
if "ProjectRDC" in df.columns:
    rdc_counts = df["ProjectRDC"].value_counts().nlargest(10)
    fig_rdc = px.bar(
        x=rdc_counts.values,
        y=rdc_counts.index,
        orientation="h",
        labels={"x": "Number of Projects", "y": "RDC"},
        title="Top 10 RDCs by Number of Projects",
        template="plotly_white",
        text=rdc_counts.values
    )
    fig_rdc.update_traces(textposition="outside")
    fig_rdc.update_layout(yaxis=dict(autorange="reversed"))
    fig_rdc.show()

# --- 4. Publications Trend Over Years ---
if "OutputYear" in df.columns:
    year_counts = df["OutputYear"].dropna().astype(int).value_counts().sort_index()
    fig_year = px.line(
        x=year_counts.index,
        y=year_counts.values,
        labels={"x": "Publication Year", "y": "Number of Publications"},
        title="Publications Over Time",
        markers=True,
        template="plotly_white"
    )
    fig_year.show()

# --- 5. Distribution of Citation Counts ---
if "OutputCitationCount" in df.columns:
    citation_counts = df["OutputCitationCount"].dropna()
    fig_citations = px.histogram(
        citation_counts,
        nbins=50,
        labels={"value": "Citation Count"},
        title="Distribution of Citation Counts",
        template="plotly_white"
    )
    fig_citations.update_layout(bargap=0.1)
    fig_citations.show()
Loaded data: 4433 rows, 25 columns
Basic statistics for numeric columns:
count mean std min 25% \
ProjID 4433.0 1350.315362 696.078705 5.0 802.0
ProjectYearStarted 4433.0 2014.197834 5.391017 1999.0 2010.0
ProjectYearEnded 2930.0 2016.620819 5.495761 2001.0 2013.0
OutputYear 4429.0 2017.282908 5.275356 2000.0 2014.0
OutputMonth 2704.0 5.640163 4.518506 0.0 2.0
OutputNumber 1795.0 5410.207242 92453.543162 1.0 2.0
OutputCitationCount 2698.0 111.227205 358.931144 0.0 5.0
50% 75% max
ProjID 1273.0 1873.00 3081.0
ProjectYearStarted 2015.0 2018.00 2024.0
ProjectYearEnded 2018.0 2021.00 2026.0
OutputYear 2018.0 2022.00 2025.0
OutputMonth 5.0 9.00 91.0
OutputNumber 3.0 8.00 3112901.0
OutputCitationCount 25.0 87.75 10952.0
Step3 Part2-Q1:Regression or Classification Models¶
3.1 Linear Regression¶
# --- Regression Analysis: Predicting Citation Counts ---
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Load the prepared dataset
data_path = "Combined_ResearchOutputs_Final.csv"
df = pd.read_csv(data_path)

# --- Step 1: Feature Engineering ---
print("\nFeature Engineering Started.")

# Coerce the year/citation columns to numeric (bad entries -> NaN)
for numeric_col in ["OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount"]:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Derived features: project length and paper age (both floored at 0)
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0).clip(lower=0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0).clip(lower=0)

# One-hot encode OutputType, collapsing everything outside the 5 most
# frequent categories into "Other"
top_output_types = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top_output_types), "Other")
df_encoded = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# Assemble feature matrix and target
dummy_cols = [col for col in df_encoded.columns if col.startswith("OutputType_Main_")]
features = ["ProjectDuration", "TimeSincePublication"] + dummy_cols
target = "OutputCitationCount"
df_model = df_encoded.dropna(subset=features + [target])
X = df_model[features]
y = df_model[target]
print(f"Feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")

# --- Step 2: Model Training ---
print("\nModel Training Started.")

# Hold out 20% for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares and predict on the held-out set
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Evaluation metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Model Evaluation:")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# --- Step 3: Visualization of Results ---
print("\nVisualization Started.")

# 1. Scatter plot of true vs. predicted citations
fig1 = px.scatter(
    x=y_test,
    y=y_pred,
    labels={"x": "Actual Citations", "y": "Predicted Citations"},
    title="Actual vs. Predicted Citation Counts",
    trendline="ols",
    template="plotly_white"
)
fig1.show()

# 2. Residuals plot (error = actual - predicted)
residuals = y_test - y_pred
fig2 = px.scatter(
    x=y_pred,
    y=residuals,
    labels={"x": "Predicted Citations", "y": "Residuals (Error)"},
    title="Residuals vs. Predicted Values",
    template="plotly_white"
)
fig2.add_hline(y=0, line_dash="dash", line_color="red")
fig2.show()
Feature Engineering Started. Feature matrix shape: (2698, 7) Target variable shape: (2698,) Model Training Started. Model Evaluation: R² Score: 0.0590 MAE: 141.53 RMSE: 384.56 Visualization Started.
Regression Model Results and Interpretation
Model: Linear regression predicting total citation count using
- Project duration
- Time since publication
- One-hot encoded major output types
Performance Metrics
- R² Score: 0.0590
- The model explains about 5.9% of the variance in citation counts, indicating a weak fit.
- Mean Absolute Error (MAE): 141.53
- On average, predictions deviate from true citation counts by ~142 citations.
- Root Mean Squared Error (RMSE): 384.56
- Large RMSE reflects sensitivity to high-citation outliers.
Actual vs. Predicted Citation Counts
The scatter plot shows predicted vs. actual citations with an OLS trendline.
- Underprediction of high-citation papers: Many points with actual >500 citations cluster below the 45° line, indicating the model cannot capture extreme values.
- Tendency toward the mean: Predictions for low- and mid-citation papers are closer to the diagonal, but variability remains large.
Residuals vs. Predicted Values
The residual plot (error = actual – predicted) reveals:
- Heteroscedasticity: Residual spread increases with higher predicted values, violating constant variance assumption.
- Outliers: Several large positive and negative residuals for high-citation papers, showing the linear model’s limitations.
Insights and Next Steps
- Feature Limitations
- Duration and time since publication are insufficient to predict citation impact. Citations depend heavily on content quality, venue prestige, author reputation, and network effects.
- Model Improvement
- Incorporate richer features:
- Venue prestige score (e.g., impact factor)
- Author-specific metrics (e.g., h-index)
- Textual features from titles/abstracts (TF-IDF vectors or embeddings)
- Incorporate richer features:
- Alternative Algorithms
- Tree-based models (Random Forest, XGBoost) to capture non-linear relationships and interactions.
- Robust regression to reduce sensitivity to outliers.
- Error Analysis
- Examine top residuals to identify systematic biases (e.g., certain RDCs or output types consistently underpredicted).
- Cross-Validation & Regularization
- Use k-fold CV to validate stability and tune regularization (Lasso, Ridge) to prevent overfitting.
3.2 Logistic Classification¶
# --- Classification Analysis: Predicting High vs. Low Citation Papers ---
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
confusion_matrix, accuracy_score, precision_score,
recall_score, f1_score, roc_curve, auc
)
# Load the combined dataset
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")

# Step 1: Feature Engineering
print("Feature Engineering Started.")

# Coerce raw year/citation columns to numeric (unparseable values -> NaN).
for column in ("OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount"):
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Engineered temporal features: NaN treated as 0, floored at zero.
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0).clip(lower=0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0).clip(lower=0)

# Keep the five most common output types; everything else becomes "Other".
top_types = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top_types), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# Binary target: 1 when the citation count is at or above the median.
median_cite = df["OutputCitationCount"].median()
df["HighCitations"] = (df["OutputCitationCount"] >= median_cite).astype(int)

# Design matrix: the two temporal features plus the output-type dummies.
dummy_cols = [c for c in df.columns if c.startswith("OutputType_Main_")]
feature_cols = ["ProjectDuration", "TimeSincePublication"] + dummy_cols
X = df[feature_cols].fillna(0)
y = df["HighCitations"]
print(f"Features: {feature_cols}")
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

# Step 2: Model Training
print("\nModel Training Started.")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Compute metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
for line in (
    "Model Evaluation:",
    f"Accuracy: {acc:.3f}",
    f"Precision: {prec:.3f}",
    f"Recall: {rec:.3f}",
    f"F1 Score: {f1:.3f}",
    f"ROC AUC: {roc_auc:.3f}",
):
    print(line)

# Step 3: Visualization
print("\nVisualization Started.")

# Confusion-matrix heatmap.
cm = confusion_matrix(y_test, y_pred)
labels = ["Low", "High"]
px.imshow(
    cm,
    x=labels,
    y=labels,
    text_auto=True,
    labels={"x": "Predicted", "y": "Actual"},
    title="Confusion Matrix",
    color_continuous_scale="Blues",
    template="plotly_white",
).show()

# ROC curve with the random-chance diagonal as reference.
fig_roc = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name="ROC Curve"),
        go.Scatter(x=[0, 1], y=[0, 1], mode="lines", name="Random", line=dict(dash="dash")),
    ]
)
fig_roc.update_layout(
    title=f"ROC Curve (AUC = {roc_auc:.3f})",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    template="plotly_white",
)
fig_roc.show()
Feature Engineering Started. Features: ['ProjectDuration', 'TimeSincePublication', 'OutputType_Main_JA', 'OutputType_Main_MI', 'OutputType_Main_Other', 'OutputType_Main_RE', 'OutputType_Main_WP'] Target distribution: HighCitations 0 0.691405 1 0.308595 Name: proportion, dtype: float64 Model Training Started. Model Evaluation: Accuracy: 0.747 Precision: 0.630 Recall: 0.442 F1 Score: 0.519 ROC AUC: 0.853 Visualization Started.
Classification Model Results and Interpretation
Model: Logistic regression predicting whether a paper’s citation count is at or above the median (“HighCitations”) using
- Project duration
- Time since publication
- One-hot encoded major output types
Target Distribution
- Low citations (below median): 69.1%
- High citations (at or above median): 30.9%
Confusion Matrix
| | Predicted Low | Predicted High |
| -------------- | ------------: | -------------: |
| Actual Low | 542 | 71 |
| Actual High| 153 | 121 |
- True negatives (TN): 542
- False positives (FP): 71
- False negatives (FN): 153
- True positives (TP): 121
Performance Metrics
- Accuracy: 0.747
- 74.7% of all papers were correctly classified.
- Precision: 0.630
- When the model predicts “HighCitations,” it is correct 63.0% of the time.
- Recall: 0.442
- The model identifies 44.2% of all truly high-citation papers.
- F1 Score: 0.519
- Harmonic mean of precision and recall, reflecting moderate balance.
- ROC AUC: 0.853
- The area under the ROC curve (0.853) indicates good overall separability between high- and low-citation classes.
ROC Curve
The ROC plot shows a strong trade-off between true positive and false positive rates, with the logistic model achieving an AUC of 0.853—significantly above random (0.5).
Insights and Next Steps
- Class Imbalance
- With roughly 70/30 split, consider resampling (SMOTE, class weights) to improve recall for the minority “HighCitations” class.
- Feature Expansion
- Add richer predictors: venue impact factor, author metrics (h-index), textual features from titles/abstracts.
- Model Variants
- Evaluate tree-based classifiers (Random Forest, XGBoost) and regularized logistic regression (L1/L2) to capture non-linear effects and reduce overfitting.
- Threshold Tuning
- Adjust decision threshold to balance precision and recall according to project priorities (e.g., maximize recall to catch more high-impact papers).
- Error Analysis
- Examine false negatives and false positives to identify systematic biases (e.g., certain RDCs or output types) and refine features accordingly.
3.3 Decision Tree¶
# ================================
# Decision Tree Classifier
# ================================
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
confusion_matrix, accuracy_score, precision_score,
recall_score, f1_score, roc_curve, auc
)
# Load data
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")

# --- Feature Engineering ---
# Coerce raw columns to numeric; invalid entries become NaN.
for column in ("OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount"):
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Derived temporal features, with NaN -> 0 and negative values floored at 0.
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0).clip(lower=0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0).clip(lower=0)

# Keep the five most frequent output types; bucket the rest as "Other".
top_types = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top_types), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# Target: 1 when citations reach the median, else 0.
median_cite = df["OutputCitationCount"].median()
df["HighCitations"] = (df["OutputCitationCount"] >= median_cite).astype(int)

dummy_cols = [c for c in df.columns if c.startswith("OutputType_Main_")]
feature_cols = ["ProjectDuration", "TimeSincePublication"] + dummy_cols
X = df[feature_cols].fillna(0)
y = df["HighCitations"]

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Model Training ---
# Shallow tree (depth 5) to limit overfitting.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# --- Metrics ---
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
for line in (
    "Decision Tree Performance:",
    f"Accuracy: {acc:.3f}",
    f"Precision: {prec:.3f}",
    f"Recall: {rec:.3f}",
    f"F1 Score: {f1:.3f}",
    f"ROC AUC: {roc_auc:.3f}",
):
    print(line)

# --- Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred)
labels = ["Low", "High"]
px.imshow(
    cm, x=labels, y=labels, text_auto=True,
    labels={"x":"Predicted","y":"Actual"},
    title="Decision Tree Confusion Matrix",
    template="plotly_white",
).show()

# --- ROC Curve ---
fig_roc = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name="ROC Curve"),
        go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line=dict(dash="dash"), name="Random"),
    ]
)
fig_roc.update_layout(
    title=f"Decision Tree ROC Curve (AUC = {roc_auc:.3f})",
    xaxis_title="False Positive Rate", yaxis_title="True Positive Rate",
    template="plotly_white",
)
fig_roc.show()
Decision Tree Performance: Accuracy: 0.768 Precision: 0.601 Recall: 0.741 F1 Score: 0.663 ROC AUC: 0.850
Decision Tree Classifier Performance
- Accuracy: 0.768
- Precision: 0.601
- Recall: 0.741
- F1 Score: 0.663
- ROC AUC: 0.850
Interpretation
The shallow decision tree achieves moderate overall accuracy and a strong ability to identify “HighCitations” papers (recall = 74.1%). However its precision (60.1%) indicates a sizable false‐positive rate. The ROC AUC of 0.850 shows decent separability but suggests that a single tree struggles to capture all decision boundaries.
Next Steps
- Prune or tune `max_depth` and `min_samples_leaf` to reduce overfitting.
- Compare feature importance to confirm which engineered features drive splits.
- Consider ensemble methods to improve precision without sacrificing recall.
3.4 Random Forest¶
# ================================
# Random Forest Classifier
# ================================
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
confusion_matrix, accuracy_score, precision_score,
recall_score, f1_score, roc_curve, auc
)
# Load data
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")

# --- Feature Engineering (same as before) ---
# Numeric coercion: bad values -> NaN.
for column in ("OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount"):
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Temporal features (NaN -> 0, floored at zero).
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0).clip(lower=0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0).clip(lower=0)

# One-hot encode the five dominant output types; collapse the rest.
top_types = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top_types), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# Binary target at the citation median.
median_cite = df["OutputCitationCount"].median()
df["HighCitations"] = (df["OutputCitationCount"] >= median_cite).astype(int)

dummy_cols = [c for c in df.columns if c.startswith("OutputType_Main_")]
feature_cols = ["ProjectDuration", "TimeSincePublication"] + dummy_cols
X = df[feature_cols].fillna(0)
y = df["HighCitations"]

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Model Training ---
# 100 trees, depth capped at 7.
rf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# --- Metrics ---
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
for line in (
    "Random Forest Performance:",
    f"Accuracy: {acc:.3f}",
    f"Precision: {prec:.3f}",
    f"Recall: {rec:.3f}",
    f"F1 Score: {f1:.3f}",
    f"ROC AUC: {roc_auc:.3f}",
):
    print(line)

# --- Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred)
labels = ["Low", "High"]
px.imshow(
    cm, x=labels, y=labels, text_auto=True,
    labels={"x":"Predicted","y":"Actual"},
    title="Random Forest Confusion Matrix",
    template="plotly_white",
).show()

# --- ROC Curve ---
fig_roc = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name="ROC Curve"),
        go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line=dict(dash="dash"), name="Random"),
    ]
)
fig_roc.update_layout(
    title=f"Random Forest ROC Curve (AUC = {roc_auc:.3f})",
    xaxis_title="False Positive Rate", yaxis_title="True Positive Rate",
    template="plotly_white",
)
fig_roc.show()
Random Forest Performance: Accuracy: 0.773 Precision: 0.610 Recall: 0.741 F1 Score: 0.669 ROC AUC: 0.859
Random Forest Classifier Performance
- Accuracy: 0.773
- Precision: 0.610
- Recall: 0.741
- F1 Score: 0.669
- ROC AUC: 0.859
Interpretation
The random forest slightly improves accuracy and F1 over the single tree, while maintaining the same recall (74.1%) and increasing precision to 61.0%. The ROC AUC of 0.859 indicates better overall discrimination of high‐ vs. low‐citation papers.
Next Steps
- Tune `n_estimators`, `max_depth`, and `max_features` via cross-validation.
- Analyze feature importance across the ensemble to validate key predictors.
- Explore out‐of‐bag error estimates for internal model validation.
3.5 XGBoost¶
# ================================
# XGBoost Classifier
# ================================
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
confusion_matrix, accuracy_score, precision_score,
recall_score, f1_score, roc_curve, auc
)
from xgboost import XGBClassifier
# Load data
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")

# --- Step 1: Feature Engineering ---
# Coerce year/citation columns to numeric; unparseable values become NaN.
df["OutputYear"] = pd.to_numeric(df["OutputYear"], errors="coerce")
df["ProjectYearStarted"] = pd.to_numeric(df["ProjectYearStarted"], errors="coerce")
df["ProjectYearEnded"] = pd.to_numeric(df["ProjectYearEnded"], errors="coerce")
df["OutputCitationCount"] = pd.to_numeric(df["OutputCitationCount"], errors="coerce")
# Temporal features: NaN -> 0 and floored at zero.
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0).clip(lower=0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0).clip(lower=0)
# One-hot encode the top 5 output types; bucket the remainder as "Other".
top_types = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top_types), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)
# Binary target: 1 when citations are at or above the median.
median_cite = df["OutputCitationCount"].median()
df["HighCitations"] = (df["OutputCitationCount"] >= median_cite).astype(int)
feature_cols = ["ProjectDuration", "TimeSincePublication"] + \
               [c for c in df.columns if c.startswith("OutputType_Main_")]
X = df[feature_cols].fillna(0)
y = df["HighCitations"]

# --- Step 2: Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Step 3: Model Training ---
# NOTE: the `use_label_encoder=False` argument was removed here. It was
# deprecated in XGBoost 1.3 and dropped entirely in 2.0, where passing it
# only produces "Parameters: { use_label_encoder } are not used" warnings.
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    eval_metric="logloss",
    random_state=42
)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
y_proba = xgb.predict_proba(X_test)[:, 1]

# --- Step 4: Evaluation ---
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
print("XGBoost Classifier Performance:")
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")

# --- Step 5: Visualization ---
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
labels = ["Low", "High"]
fig_cm = px.imshow(
    cm,
    x=labels,
    y=labels,
    text_auto=True,
    labels={"x": "Predicted", "y": "Actual"},
    title="XGBoost Confusion Matrix",
    color_continuous_scale="Blues",
    template="plotly_white"
)
fig_cm.show()
# ROC Curve with random-chance reference diagonal
fig_roc = go.Figure()
fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name="ROC Curve"))
fig_roc.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", line=dict(dash="dash"), name="Random"))
fig_roc.update_layout(
    title=f"XGBoost ROC Curve (AUC = {roc_auc:.3f})",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    template="plotly_white"
)
fig_roc.show()
XGBoost Classifier Performance: Accuracy: 0.784 Precision: 0.640 Recall: 0.682 F1 Score: 0.661 ROC AUC: 0.866
XGBoost Classifier Performance
- Accuracy: 0.784
- Precision: 0.640
- Recall: 0.682
- F1 Score: 0.661
- ROC AUC: 0.866
Interpretation
XGBoost achieves the highest accuracy (78.4%) and ROC AUC (0.866) among the models, with precision rising to 64.0%, though recall dips slightly to 68.2%. This suggests gradient boosting effectively captures complex, non‐linear relationships, improving overall classification at the cost of a small drop in recall.
Next Steps
- Perform hyperparameter tuning (`learning_rate`, `max_depth`, `subsample`) to further balance precision and recall.
- Use SHAP values for fine-grained feature impact analysis.
- Evaluate early stopping on a validation set to prevent overfitting.
3.6 Neural Network¶
# ================================
# Neural Network Classifier
# ================================
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
confusion_matrix, accuracy_score, precision_score,
recall_score, f1_score, roc_curve, auc
)
from sklearn.preprocessing import StandardScaler
# Load data
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")

# --- Feature Engineering (same) ---
# Numeric coercion: unparseable values -> NaN.
for column in ("OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount"):
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Temporal features (NaN -> 0, floored at zero).
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0).clip(lower=0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0).clip(lower=0)

# Top-5 output types one-hot encoded; everything else grouped as "Other".
top_types = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top_types), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# Median-split binary target.
median_cite = df["OutputCitationCount"].median()
df["HighCitations"] = (df["OutputCitationCount"] >= median_cite).astype(int)

dummy_cols = [c for c in df.columns if c.startswith("OutputType_Main_")]
feature_cols = ["ProjectDuration", "TimeSincePublication"] + dummy_cols
X = df[feature_cols].fillna(0)
y = df["HighCitations"]

# --- Scale features ---
# MLPs are sensitive to feature scale, so standardize before the split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# --- Model Training ---
# Two hidden layers (50 and 20 units).
mlp = MLPClassifier(hidden_layer_sizes=(50, 20), max_iter=500, random_state=42)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
y_proba = mlp.predict_proba(X_test)[:, 1]

# --- Metrics ---
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
for line in (
    "MLP Classifier Performance:",
    f"Accuracy: {acc:.3f}",
    f"Precision: {prec:.3f}",
    f"Recall: {rec:.3f}",
    f"F1 Score: {f1:.3f}",
    f"ROC AUC: {roc_auc:.3f}",
):
    print(line)

# --- Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred)
labels = ["Low", "High"]
px.imshow(
    cm, x=labels, y=labels, text_auto=True,
    labels={"x":"Predicted","y":"Actual"},
    title="MLP Confusion Matrix",
    template="plotly_white",
).show()

# --- ROC Curve ---
fig_roc = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name="ROC Curve"),
        go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line=dict(dash="dash"), name="Random"),
    ]
)
fig_roc.update_layout(
    title=f"MLP ROC Curve (AUC = {roc_auc:.3f})",
    xaxis_title="False Positive Rate", yaxis_title="True Positive Rate",
    template="plotly_white",
)
fig_roc.show()
MLP Classifier Performance: Accuracy: 0.767 Precision: 0.611 Recall: 0.672 F1 Score: 0.640 ROC AUC: 0.859
MLP Neural Network Classifier Performance
- Accuracy: 0.767
- Precision: 0.611
- Recall: 0.672
- F1 Score: 0.640
- ROC AUC: 0.859
Interpretation
The multi-layer perceptron achieves an ROC AUC on par with the random forest (0.859) but lower recall (67.2%) and F1 (0.640). This indicates the neural net captures some non‐linear patterns but may require more data or deeper/tuned architecture.
Next Steps
- Experiment with additional hidden layers, neurons, and activation functions.
- Apply dropout or L2 regularization to mitigate overfitting.
- Scale up training data or incorporate richer features (e.g., text embeddings) to boost recall.
Final Modeling Summary
- Why We Started with a Regression Model
- Objective: We first attempted to predict the exact citation count of each paper using a simple linear regression.
- Rationale: Quantifying how much “project duration” and “time since publication” drive overall impact would give a direct measure of influence.
- Regression Model Performance and Limitations
- Results:
- R² ≈ 0.059 (explains only ~6% of variance)
- MAE ≈ 142 citations
- RMSE ≈ 385 citations
- Key Finding: The regression fit is very weak—linear combinations of our engineered features cannot capture the extreme skew and heavy tail of citation counts.
- Conclusion: Predicting raw citation counts requires richer, non‐linear features (venue prestige, author reputation, text embeddings) and more complex models.
- Switching to a Classification Task
- New Goal: Predict whether a paper is “highly cited” (citation ≥ median) or not.
- Benefits:
- Reduces sensitivity to extreme outliers.
- Balances the problem into two classes (≈30% high, 70% low).
- Enables use of classification metrics (precision, recall, AUC).
- Model Comparisons
| Model | Accuracy | Precision | Recall | F1 | ROC AUC |
|---|---|---|---|---|---|
| Logistic Reg. | 0.747 | 0.630 | 0.442 | 0.519 | 0.853 |
| Decision Tree | 0.768 | 0.601 | 0.741 | 0.663 | 0.850 |
| Random Forest | 0.773 | 0.610 | 0.741 | 0.669 | 0.859 |
| XGBoost | 0.784 | 0.640 | 0.682 | 0.661 | 0.866 |
| MLP (Neural Net) | 0.767 | 0.611 | 0.672 | 0.640 | 0.859 |
- Observations:
- All tree‐based ensembles (Random Forest, XGBoost) outperform the logistic baseline in precision, F1, and AUC.
- XGBoost achieves the best overall discrimination (ROC AUC ≈ 0.866) and highest accuracy (78.4%).
- Decision Tree alone recovers the highest recall (74.1%), but at the cost of more false positives.
- The neural network (MLP) matches Random Forest in AUC but lags slightly in recall and F1.
- Insights from the Dataset
- Heavy‐Tail Distribution: Citations are extremely skewed—most papers receive modest citations, while a few “superstars” dominate.
- Feature Predictiveness: Simple temporal and type features explain only a fraction of citation behavior.
- Class Imbalance: Nearly 70% of papers fall into the “low‐citation” class, motivating careful threshold tuning or resampling.
- Modeling Implication: Capturing high‐impact papers reliably requires:
- Richer metadata (journal impact factors, author h-indices).
- Textual features (TF-IDF or embedding vectors from titles/abstracts).
- Network features (co-authorship connections, institutional collaborations).
Step4 Part2-Q2:PCA¶
# --- PCA Analysis on Combined Dataset with Page Parsing ---
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the combined dataset
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")

# Step 1: Feature Engineering
# Numeric coercion: unparseable entries become NaN.
for column in ("OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount"):
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Temporal features: NaN -> 0 and floored at zero.
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0).clip(lower=0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0).clip(lower=0)
def parse_pages(val):
    """Convert an OutputPages cell to a numeric page count.

    A "start-end" range becomes ``end - start + 1``; a plain number is
    returned as a float; missing or unparseable values yield NaN.
    """
    # Missing values propagate as NaN.
    if pd.isna(val):
        return np.nan
    s = str(val)
    if "-" in s:
        start, end = s.split("-", 1)
        # Catch only the parse failure (was a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit and hid real bugs).
        try:
            return int(end) - int(start) + 1
        except ValueError:
            return np.nan
    try:
        return float(s)
    except ValueError:
        return np.nan
df["OutputPages"] = df["OutputPages"].apply(parse_pages)

# One-hot encode top 5 output types, group others under "Other"
top_types = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top_types), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# Step 2: Select features for PCA
# NOTE(review): OutputMonth/OutputVolume/OutputNumber are used as-is with no
# pd.to_numeric coercion; if the CSV stores non-numeric strings in them,
# StandardScaler below will fail — confirm against the data file.
numeric_cols = [
    "ProjectDuration",
    "TimeSincePublication",
    "OutputYear",
    "OutputMonth",
    "OutputVolume",
    "OutputNumber",
    "OutputPages",
    "OutputCitationCount",
]
onehot_cols = [c for c in df.columns if c.startswith("OutputType_Main_")]
features = numeric_cols + onehot_cols

# Rows with any missing feature value are excluded from the PCA.
df_pca = df.dropna(subset=features).copy()

# Standardize, then project onto at most 10 principal components.
X_scaled = StandardScaler().fit_transform(df_pca[features])
n_components = min(len(features), 10)
pca = PCA(n_components=n_components)
components = pca.fit_transform(X_scaled)
explained = pca.explained_variance_ratio_

# DataFrame of component scores, one column per PC.
pc_names = [f"PC{i+1}" for i in range(components.shape[1])]
pca_df = pd.DataFrame(components, columns=pc_names)

# Optional: carry over the high/low citation label when present.
# (This cell never creates the column itself, so this only fires if the
# upstream CSV already contains it.)
if "HighCitations" in df_pca.columns:
    pca_df["HighCitations"] = df_pca["HighCitations"].astype(str)

# Step 4: Plotly Visualizations
# 4.1 Scree plot (per-component variance) with cumulative-variance overlay.
scree_labels = [f"PC{i+1}" for i in range(len(explained))]
fig_scree = px.bar(
    x=scree_labels,
    y=explained,
    labels={"x": "Principal Component", "y": "Variance Explained"},
    title="PCA Scree Plot",
    template="plotly_white",
)
fig_scree.add_trace(
    go.Scatter(
        x=scree_labels,
        y=np.cumsum(explained),
        mode="lines+markers",
        name="Cumulative Variance",
    )
)
fig_scree.show()

# 4.2 Scatter of the first two components.
fig_scatter = px.scatter(
    pca_df,
    x="PC1",
    y="PC2",
    color="HighCitations" if "HighCitations" in pca_df.columns else None,
    labels={"PC1": "Principal Component 1", "PC2": "Principal Component 2"},
    title="PCA Scatter Plot (PC1 vs PC2)",
    template="plotly_white",
)
fig_scatter.show()

# 4.3 Loadings of each input feature on the first five components.
loadings = pd.DataFrame(
    pca.components_.T[:, :5],
    index=features,
    columns=[f"PC{i+1}" for i in range(5)],
)
px.imshow(
    loadings,
    labels={"x": "Principal Components", "y": "Features", "color": "Loading"},
    title="PCA Feature Loadings (First 5 PCs)",
    template="plotly_white",
    text_auto=True,
).show()
PCA Scree Plot Interpretation
- The first two principal components (PC1 and PC2) explain roughly 23% and 20% of the total variance, respectively.
- The first four components together capture about 66% of the variance, indicating that a low‐dimensional representation retains most of the signal.
- Beyond PC5, each additional component contributes less than 9% to explained variance, suggesting diminishing returns.
PCA Scatter Plot (PC1 vs PC2)
- The majority of papers cluster near the origin in PC1–PC2 space, with a long tail along PC1 for a few outliers.
- No clear bimodal separation by “HighCitations” appears, implying citation status is not cleanly separable by the first two PCs alone.
- A handful of outliers with large positive PC1 values correspond to very high–citation or long‐duration publications.
PCA Feature Loadings (First 5 PCs)
- PC1 loads most negatively on TimeSincePublication and ProjectDuration, and moderately on OutputCitationCount. This axis reflects a “temporal impact” dimension: older, longer projects with higher citations score low on PC1.
- PC2 loads positively on OutputVolume, OutputNumber, and OutputYear, capturing a “size and recency” dimension (larger and more recent outputs).
- PC4 is dominated by OutputPages, isolating variations in page count.
- One‐hot encoded output‐type features have minimal loadings on the first five PCs, suggesting that output type contributes less to the principal variance directions.
Overall Insights from PCA
- Temporal vs. Structural Variance: The strongest source of variance is temporal/project‐duration effects, not output type or venue.
- Dimensionality Reduction: A 4–5 dimensional PCA subspace captures most of the structure, enabling potential clustering or visualization in reduced space.
- Feature Importance: Page count and volume/number of outputs emerge as secondary axes of variation, which could guide further feature engineering (e.g., normalizing citation by length or volume).
- Citation Separability: Since “HighCitations” does not form distinct clusters in PC1–PC2, more specialized features (text embeddings, author metrics, venue prestige) may be needed to separate high‐impact papers.
Step5 Part2-Q3:Clustering¶
5.1 K-Means¶
# ==============================================
# Method 1: K-Means on PCA-Reduced Data (3D Plot)
# ==============================================
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# -- 1. Load and Feature-Engineer --
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")

# Coerce year and citation columns to numeric; unparseable -> NaN.
numeric_casts = ("OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount")
for column in numeric_casts:
    df[column] = pd.to_numeric(df[column], errors="coerce")
def parse_pages(val):
    """Parse an OutputPages value into a numeric page count.

    "start-end" -> ``end - start + 1``; a bare number -> float; anything
    unparseable -> NaN. A float NaN input stringifies to "nan", which
    ``float`` converts back to NaN, matching the original behavior.
    """
    s = str(val)
    if "-" in s:
        start, end = s.split("-", 1)
        # Narrow replacement for the original bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit.
        try:
            return int(end) - int(start) + 1
        except ValueError:
            return np.nan
    try:
        return float(s)
    except ValueError:
        return np.nan
# Parse "start-end" page ranges into page counts.
df["OutputPages"] = df["OutputPages"].apply(parse_pages)

# Engineered numeric features (NaN -> 0, floored at zero).
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0).clip(lower=0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0).clip(lower=0)

# Bucket all but the five most common output types as "Other", then one-hot.
top5 = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top5), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# -- 2. Prepare feature matrix --
# NOTE(review): OutputMonth/OutputVolume/OutputNumber are not coerced to
# numeric here; confirm the CSV stores them as numbers, otherwise
# StandardScaler below will fail.
numeric = ["ProjectDuration", "TimeSincePublication", "OutputYear",
           "OutputMonth", "OutputVolume", "OutputNumber", "OutputPages", "OutputCitationCount"]
onehots = [c for c in df if c.startswith("OutputType_Main_")]
features = numeric + onehots
df_feat = df.dropna(subset=features).copy()
X = df_feat[features].values

# -- 3. Standardize and PCA to 5 dims --
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=5, random_state=42)
pcs = pca.fit_transform(X_scaled)
for idx, column_values in enumerate(pcs.T):
    df_feat[f"PC{idx+1}"] = column_values

# -- 4. K-Means in PC space --
df_feat["cluster_kmeans"] = KMeans(n_clusters=4, random_state=42).fit_predict(pcs)

# -- 5. 3D scatter plot --
px.scatter_3d(
    df_feat, x="PC1", y="PC2", z="PC3",
    color="cluster_kmeans",
    title="K-Means Clusters on First 5 PCs",
    labels={"cluster_kmeans":"Cluster ID"},
    template="plotly_white",
).show()
K-Means on PCA-Reduced Data
Approach
- Engineered numeric features (project duration, time since publication, year/month/volume/number/pages, citation count) and one-hot encoded the top 5 output types.
- Standardized all features and applied PCA to reduce to 5 dimensions.
- Ran K-Means (k=4) on the 5-dimensional PCA space.
- Visualized the resulting clusters in a 3D scatter of PC1 vs. PC2 vs. PC3.
Results
- Four relatively compact clusters emerge in the leading principal-component subspace.
- One cluster (Cluster 2) contains the handful of extreme outliers (very high citations and long durations).
- The remaining three clusters separate moderately by “temporal impact” (PC1) and “size/recency” (PC2).
Insight
The primary axes of variation (duration vs. recency) drive distinct groupings:
- High-impact, long-duration papers form their own cluster.
- Recent, moderate-impact outputs cluster together.
- Older, lower-impact outputs separate into another group.
This suggests that citation trajectory and project timescale jointly define natural cohorts in the data.
5.2 Agglomerative Clustering¶
# =====================================================
# Method 2: Agglomerative Clustering + UMAP Visualization
# =====================================================
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
import umap
# 1. Load & Feature-Engineer (same as above)
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")
# Coerce year/citation columns to numeric; unparseable entries become NaN.
for col in ("OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount"):
    df[col] = pd.to_numeric(df[col], errors="coerce")
def parse_pages(val):
    """Convert an OutputPages value to a numeric page count.

    A hyphenated range such as "120-135" becomes its inclusive length
    (135 - 120 + 1 = 16); a plain number is returned as a float; anything
    unparseable yields NaN so it can be dropped/filled downstream.
    """
    try:
        s = str(val)
        if "-" in s:
            # Inclusive page span, e.g. "120-135" -> 16.
            # NOTE(review): assumes the end page is written in full;
            # truncated ranges like "120-35" would come out negative.
            a, b = s.split("-", 1)
            return int(b) - int(a) + 1
        return float(s)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` so real problems (e.g.
        # KeyboardInterrupt) are no longer silently swallowed.
        return np.nan
# Engineer numeric features.
df["OutputPages"] = df["OutputPages"].map(parse_pages)
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0)

# Keep the five most common output types, fold the rest into "Other",
# then one-hot encode (drop_first avoids a redundant column).
top5 = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top5), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# 2. Feature matrix & scale
# NOTE(review): OutputMonth/OutputVolume/OutputNumber are never passed
# through to_numeric above — confirm they are numeric in the CSV before
# feeding them to StandardScaler.
numeric = [
    "ProjectDuration", "TimeSincePublication", "OutputYear", "OutputMonth",
    "OutputVolume", "OutputNumber", "OutputPages", "OutputCitationCount",
]
onehots = [col for col in df.columns if col.startswith("OutputType_Main_")]
features = numeric + onehots
df_feat = df.dropna(subset=features).copy()
X = StandardScaler().fit_transform(df_feat[features])
# 3. Agglomerative clustering (Ward linkage by default, 4 clusters)
agg = AgglomerativeClustering(n_clusters=4)
df_feat["cluster_agg"] = agg.fit_predict(X)

# 4. UMAP for 2D projection (visualization only; clustering used full space)
reducer = umap.UMAP(random_state=42)
umap_coords = reducer.fit_transform(X)
df_feat["UMAP1"] = umap_coords[:, 0]
df_feat["UMAP2"] = umap_coords[:, 1]

# 5. 2D scatter colored by cluster
fig = px.scatter(
    df_feat,
    x="UMAP1",
    y="UMAP2",
    color="cluster_agg",
    title="Agglomerative Clusters on UMAP Projection",
    labels={"cluster_agg": "Cluster ID"},
    template="plotly_white",
)
fig.show()
Agglomerative Clustering on Full Features + UMAP Visualization
Approach
- Used the full standardized feature matrix (same engineered features as above).
- Performed hierarchical (agglomerative) clustering with 4 clusters.
- Projected the high-dimensional data into 2D using UMAP for visualization.
Results
- Two large clusters (Cluster 1 and Cluster 3) are clearly separated in UMAP space, with a smaller third cluster and a few isolated points.
- One cluster groups the very high‐citation, high‐volume outputs.
- Another cluster groups recent but lower‐volume outputs.
- A third medium‐sized cluster captures mid‐range citations and durations.
Insight
Hierarchical clustering reveals a natural hierarchy:
- Elite group of highly cited, long papers
- Mainstream group of moderate citations
- Emerging group of recent, low‐volume outputs
This hierarchy aligns with the research lifecycle: established high-impact outputs separate cleanly from routine and newly emerging ones.
5.3 DBSCAN¶
# =====================================================
# Method 3: DBSCAN Clustering + UMAP Visualization
# =====================================================
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import umap
# 1. Load & Feature-Engineer (same)
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")
# Coerce year/citation columns to numeric; unparseable entries become NaN.
for col in ("OutputYear", "ProjectYearStarted", "ProjectYearEnded", "OutputCitationCount"):
    df[col] = pd.to_numeric(df[col], errors="coerce")
def parse_pages(val):
    """Convert an OutputPages value to a numeric page count.

    A hyphenated range such as "120-135" becomes its inclusive length
    (135 - 120 + 1 = 16); a plain number is returned as a float; anything
    unparseable yields NaN so it can be dropped/filled downstream.
    """
    try:
        s = str(val)
        if "-" in s:
            # Inclusive page span, e.g. "120-135" -> 16.
            # NOTE(review): assumes the end page is written in full;
            # truncated ranges like "120-35" would come out negative.
            a, b = s.split("-", 1)
            return int(b) - int(a) + 1
        return float(s)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` so real problems (e.g.
        # KeyboardInterrupt) are no longer silently swallowed.
        return np.nan
# Engineer numeric features.
df["OutputPages"] = df["OutputPages"].map(parse_pages)
df["ProjectDuration"] = (df["ProjectYearEnded"] - df["ProjectYearStarted"]).fillna(0)
df["TimeSincePublication"] = (2025 - df["OutputYear"]).fillna(0)

# Keep the five most common output types, fold the rest into "Other",
# then one-hot encode (drop_first avoids a redundant column).
top5 = df["OutputType"].value_counts().nlargest(5).index
df["OutputType_Main"] = df["OutputType"].where(df["OutputType"].isin(top5), "Other")
df = pd.get_dummies(df, columns=["OutputType_Main"], drop_first=True)

# 2. Feature matrix & scale
# NOTE(review): OutputMonth/OutputVolume/OutputNumber are never passed
# through to_numeric above — confirm they are numeric in the CSV before
# feeding them to StandardScaler.
numeric = [
    "ProjectDuration", "TimeSincePublication", "OutputYear", "OutputMonth",
    "OutputVolume", "OutputNumber", "OutputPages", "OutputCitationCount",
]
onehots = [col for col in df.columns if col.startswith("OutputType_Main_")]
features = numeric + onehots
df_feat = df.dropna(subset=features).copy()
X = StandardScaler().fit_transform(df_feat[features])
# 3. DBSCAN clustering (eps/min_samples as chosen above; label -1 = noise)
db = DBSCAN(eps=1.5, min_samples=15)
df_feat["cluster_dbscan"] = db.fit_predict(X)

# 4. UMAP for 2D projection (visualization only; clustering used full space)
reducer = umap.UMAP(random_state=42)
umap_coords = reducer.fit_transform(X)
df_feat["UMAP1"] = umap_coords[:, 0]
df_feat["UMAP2"] = umap_coords[:, 1]

# 5. 2D scatter colored by cluster
fig = px.scatter(
    df_feat,
    x="UMAP1",
    y="UMAP2",
    color="cluster_dbscan",
    title="DBSCAN Clusters on UMAP Projection",
    labels={"cluster_dbscan": "Cluster ID"},
    template="plotly_white",
)
fig.show()
DBSCAN on Full Features + UMAP Visualization
Approach
- Applied DBSCAN (ε=1.5, min_samples=15) directly on the standardized feature space.
- Used UMAP to reduce the same space to 2D for plotting.
- Labeled points with their DBSCAN cluster IDs (-1 = noise).
Results
- DBSCAN identifies a single large core cluster (Cluster 0) that contains the majority of papers.
- A small number of points are labeled as “noise” (Cluster –1), corresponding to extreme outliers in both UMAP dimensions.
Insight
Density-based clustering shows that most FSRDC outputs form one dense “core” of fairly homogeneous behavior, while only a handful of papers are sufficiently unusual (in citation count, duration, or size features) to be flagged as noise. This underscores how the bulk of research outputs share similar impact patterns, with only a few distinct “breakouts” deserving further qualitative study.
Clustering Summary
In this section we applied three different clustering approaches to the combined FSRDC research‐output dataset, each yielding complementary insights:
K-Means on PCA Space
- We reduced our engineered features (project duration, recency, output size, pages, citation count, and output type) into the top 5 principal components, then ran K-Means (k=4).
- Clusters in the PC1–PC3 3D plot separated outliers (very high‐impact, long‐running projects) from mainstream and recent outputs.
Agglomerative Clustering with UMAP
- We clustered directly in the full standardized feature space using hierarchical linkage, then visualized the same data in a 2D UMAP projection.
- This revealed a clear hierarchy: a small “elite” cluster of high‐impact outputs, a large “core” cluster of moderate‐impact work, and an “emerging” cluster of recently published, lower‐volume outputs.
DBSCAN with UMAP
- We ran density‐based DBSCAN on the full feature set and again used UMAP for 2D visualization.
- DBSCAN labeled most records as a single dense cluster and isolated a handful of true outliers (noise) corresponding to extreme citation or duration profiles.
Comparative Insights
- Consistency Across Methods: All three methods consistently isolate a small group of extremely high‐impact, long‐duration outputs from the general body of publications.
- Core vs. Outliers: K-Means and Agglomerative identify multiple interior clusters, whereas DBSCAN considers all moderate outputs as one “core” and only flags a few outliers.
- Feature Drivers: The primary axes (project duration, time since publication, citation count, and output size) dominate cluster separation, confirming that temporal and scale factors are the main dimensions of variance in this dataset.
- Research Lifecycle Patterns: Clusters map to different phases of research output—“breakout” high‐impact work, stable moderate‐impact publications, and “new” low‐impact projects—suggesting a continuum of influence and maturity.
Step6 Part2-Q4:Text Processing Methods¶
6.1 TF-IDF + Logistic Regression Classification¶
# 1) TF-IDF + Logistic Regression Classification with Plotly ROC & Confusion Matrix
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix
import plotly.graph_objects as go
import plotly.express as px
# Load and label: HighCite = 1 when citations reach the median, else 0.
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")
df = df.dropna(subset=["OutputTitle", "OutputCitationCount"])
median_cite = df["OutputCitationCount"].median()
df["HighCite"] = (df["OutputCitationCount"] >= median_cite).astype(int)

# Stratified 80/20 train/test split on titles.
X_train, X_test, y_train, y_test = train_test_split(
    df["OutputTitle"],
    df["HighCite"],
    stratify=df["HighCite"],
    random_state=42,
    test_size=0.2,
)

# TF-IDF over unigrams + bigrams, capped at 5k features.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words="english")
X_tr = tfidf.fit_transform(X_train)
X_te = tfidf.transform(X_test)

# Fit logistic regression, then score the held-out titles.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_tr, y_train)
y_pred_proba = clf.predict_proba(X_te)[:, 1]
y_pred = clf.predict(X_te)
# ROC curve: model trace plus the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
fig_roc = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC AUC={roc_auc:.3f}"),
        go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line=dict(dash="dash"), name="Random"),
    ]
)
fig_roc.update_layout(
    title="Logistic ROC Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
)
fig_roc.show()

# Confusion matrix rendered as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
labels = ["Low", "High"]
heatmap = go.Heatmap(
    z=cm,
    x=labels,
    y=labels,
    text=cm,
    texttemplate="%{text}",
    colorscale="Blues",
)
fig_cm = go.Figure(data=heatmap)
fig_cm.update_layout(title="Confusion Matrix", xaxis_title="Predicted", yaxis_title="Actual")
fig_cm.show()
TF-IDF + Logistic Regression Classification
Approach
- We binarized citation counts at the median, labeling each paper “High” or “Low” impact.
- We transformed the paper titles into a TF-IDF feature matrix (unigrams & bigrams, 5k features, English stop-words removed).
- We trained a Logistic Regression model on 80% of the data and evaluated on the 20% hold-out.
Results
ROC AUC ≈ 0.700, indicating modest ability to distinguish high‐ vs. low‐citation papers based on title alone.
True positives (High→High): 176
False negatives (High→Low): 98
False positives (Low→High): 100
True negatives (Low→Low): 166
Insight
- Title text carries some signal about eventual citation impact (e.g., topics or keywords associated with higher visibility), but performance is limited (AUC ~0.7).
- A large number of false positives/negatives suggests that title alone is insufficient; deeper context (abstract, keywords, author reputation, venue) and richer models (BERT-based fine-tuning) are likely needed to improve predictive power.
- Nevertheless, TF-IDF + Logistic Regression provides a fast baseline and highlights which title n-grams most strongly correlate with high citation counts.
6.2 Topic Modeling with sklearn’s LDA¶
# 2) Topic Modeling with sklearn’s LDA + Plotly Bar Charts of Top Words per Topic
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import plotly.express as px
# Load data
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")
titles = df["OutputTitle"].dropna().tolist()

# Term-document matrix: uni/bi-grams, dropping very rare (<5 docs)
# and very common (>50% of docs) terms.
vectorizer = CountVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
    ngram_range=(1, 2),
)
td_matrix = vectorizer.fit_transform(titles)
feature_names = vectorizer.get_feature_names_out()

# Fit a 5-topic LDA model with a fixed seed for reproducibility.
n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10, random_state=42)
lda.fit(td_matrix)
# For each topic, chart the 10 heaviest words as a horizontal bar plot.
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    # Indices of the n_top_words largest weights, highest first
    # (equivalent to reversing the full argsort and truncating).
    top_indices = topic.argsort()[-n_top_words:][::-1]
    df_topic = pd.DataFrame(
        {
            "word": [feature_names[i] for i in top_indices],
            "weight": topic[top_indices],
        }
    )
    fig = px.bar(
        df_topic,
        x="weight",
        y="word",
        orientation="h",
        title=f"Topic {topic_idx} Top Words",
        labels={"weight": "Importance", "word": "Word"},
        template="plotly_white",
    )
    fig.update_layout(yaxis={"categoryorder": "total ascending"})
    fig.show()
LDA Topic Modeling Analysis
We applied a 5-topic Latent Dirichlet Allocation model over the FSRDC output titles (ngram range 1–2, min_df=5, max_df=0.5). Below is a summary of each topic—with a human-readable label—and the top words driving it:
Health & Insurance Policy
- Top Words: health, trade, evidence, insurance, effects, care, international, housing
- Interpretation: Titles in this cluster focus on health economics, insurance markets, and evidence-based policy evaluation.
Labor & Social Science Studies
- Top Words: data, evidence, social, wage, census, earnings, using, productivity
- Interpretation: Emphasis on micro-data analyses of labor markets, wage gaps, and social survey evidence.
U.S. Macro & Business Cycles
- Top Words: states, united, united states, labor, growth, business, recession
- Interpretation: Papers addressing national economic trends, state-level policy, and business-cycle dynamics in the U.S.
Manufacturing & Firms Research
- Top Words: data, income, level, firm, survey, manufacturing, production, plant
- Interpretation: Focus on firm-level productivity, manufacturing sector studies, and production data analysis.
COVID & Labor Market Effects
- Top Words: evidence, market, labor, firm, labor market, productivity, covid, covid 19
- Interpretation: Research on how the COVID-19 shock affected labor markets, firm behavior, and productivity.
Insight
- Topical Diversity: FSRDC outputs span health policy, labor economics, U.S. macroeconomics, firm-level studies, and pandemic impacts.
- Dominant Themes: “Health & Insurance” and “Labor & Social Science” topics are the most prevalent, reflecting strong public-policy relevance.
- Emerging Focus: The “COVID & Labor Market Effects” topic highlights rapid pivot to pandemic research in 2020–2022.
- Data-Driven Decisions: The consistent presence of “data” and “evidence” across multiple topics underscores the centrality of micro-data methods in modern policy research.
6.3 BERT Embeddings + K-Means + UMAP¶
# 3) BERT Embeddings + K-Means + UMAP with Plotly Scatter (with explicit renderer)
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap
import plotly.express as px
import plotly.io as pio
# Render figures inline in VS Code.
# NOTE(review): an earlier cell sets 'notebook_connected' — confirm which
# renderer the execution environment actually needs.
pio.renderers.default = "vscode"

# Load titles
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")
titles = list(df["OutputTitle"].dropna())

# Encode each title with the MiniLM sentence transformer.
model = SentenceTransformer("all-MiniLM-L6-v2")
emb = model.encode(titles, show_progress_bar=True)

# K-Means (k=4) directly on the embeddings.
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(emb)

# UMAP down to 2D for plotting.
reducer = umap.UMAP(random_state=42)
coords = reducer.fit_transform(emb)
Batches: 0%| | 0/139 [00:00<?, ?it/s]
# Assemble a plotting frame: 2D coordinates plus cluster labels.
viz = pd.DataFrame(
    {
        "UMAP1": coords[:, 0],
        "UMAP2": coords[:, 1],
        "Cluster": clusters,
    }
)

# Scatter colored by K-Means cluster ID.
fig = px.scatter(
    viz,
    x="UMAP1",
    y="UMAP2",
    color="Cluster",
    title="K-Means on BERT Embeddings + UMAP Projection",
    labels={"Cluster": "Cluster ID"},
    template="plotly_white",
)
fig.update_layout(width=800, height=600)
fig.show()
K-Means Clustering on BERT Title Embeddings + UMAP Projection
Methodology
- BERT Embeddings
- Used the “all-MiniLM-L6-v2” Sentence-Transformer to convert each paper’s title into a 384-dimensional vector capturing semantic content.
- K-Means Clustering
- Ran K-Means (k=4) on these embeddings to partition the title space into four topical/semantic clusters.
- UMAP Visualization
- Projected the high-dimensional embeddings down to 2D with UMAP for visual inspection, then colored each point by its K-Means cluster ID in Plotly.
What the Plot Shows
- The scatter reveals four roughly distinct “clouds” of paper titles in UMAP space.
- Each UMAP cluster (denoted by color) corresponds to one of the four K-Means groups.
- Some overlap persists—titles at cluster boundaries share terminology or themes.
Key Insights
- Cluster Separation: The fact that four clusters are visually discernible suggests that FSRDC paper titles naturally group into a handful of semantic topics (e.g., health, economics, energy, methods).
- Cluster Density: The largest, densest cluster (e.g., Cluster 2 in yellow) likely corresponds to the most common research theme across RDCs.
- Outliers: Points isolated far from any cluster center (e.g., upper-right extremes) represent highly specialized or unusual titles that may warrant deeper manual review.
- Boundary Papers: Papers lying in the “overlap zones” may bridge topics—good candidates for interdisciplinary work or emerging research frontiers.
6.4 LSTM Text Classifier with Keras¶
# 4) LSTM Text Classifier with Keras + Plotly Metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import plotly.graph_objects as go
# Load and split: label = 1 when citations reach the median ("high impact").
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")
df = df.dropna(subset=["OutputTitle", "OutputCitationCount"])
med = df["OutputCitationCount"].median()
df["label"] = (df["OutputCitationCount"] >= med).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    df["OutputTitle"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Tokenize & pad: vocab capped at 10k with an OOV bucket; sequences
# post-padded/truncated to a fixed length of 50 tokens.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
maxlen = 50
X_tr = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen, padding="post")
X_te = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen, padding="post")
# Build LSTM model: embedding -> LSTM(64) -> Dense(32, relu) -> sigmoid head.
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=maxlen))
model.add(LSTM(64))
model.add(Dense(32, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train for 5 epochs, validating on the held-out split each epoch.
hist = model.fit(X_tr, y_train, epochs=5, batch_size=32, validation_data=(X_te, y_test))
# Plot accuracy & loss curves. The epoch axis is derived from the recorded
# history length instead of a hardcoded range(1, 6), so the chart stays
# correct if the epoch count in the fit() call above ever changes.
epochs = list(range(1, len(hist.history["loss"]) + 1))
fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y=hist.history["accuracy"], mode="lines+markers", name="Train Acc"))
fig.add_trace(go.Scatter(x=epochs, y=hist.history["val_accuracy"], mode="lines+markers", name="Val Acc"))
fig.add_trace(go.Scatter(x=epochs, y=hist.history["loss"], mode="lines+markers", name="Train Loss"))
fig.add_trace(go.Scatter(x=epochs, y=hist.history["val_loss"], mode="lines+markers", name="Val Loss"))
fig.update_layout(title="LSTM Training Metrics", xaxis_title="Epoch", yaxis_title="Value", template="plotly_white")
fig.show()
Epoch 1/5 68/68 ━━━━━━━━━━━━━━━━━━━━ 2s 13ms/step - accuracy: 0.5221 - loss: 0.6945 - val_accuracy: 0.4926 - val_loss: 0.6944 Epoch 2/5 68/68 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - accuracy: 0.5080 - loss: 0.6930 - val_accuracy: 0.5074 - val_loss: 0.6930 Epoch 3/5 68/68 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.4918 - loss: 0.6938 - val_accuracy: 0.5074 - val_loss: 0.6930 Epoch 4/5 68/68 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - accuracy: 0.5230 - loss: 0.6925 - val_accuracy: 0.4926 - val_loss: 0.6932 Epoch 5/5 68/68 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - accuracy: 0.4870 - loss: 0.6933 - val_accuracy: 0.5074 - val_loss: 0.6930
LSTM Text Classifier: Training Metrics Analysis
Model Setup
- We trained a simple Embedding→LSTM→Dense pipeline on paper titles to classify “high” vs. “low” citation count (threshold = median).
- Vocabulary capped at 10 000 tokens, sequence length = 50, LSTM size = 64, Dense(32), binary cross-entropy loss.
Training vs. Validation Accuracy
- Train Acc started at ~0.495, peaked at ~0.515 around epoch 4, then settled back to ~0.498.
- Val Acc hovered around ~0.505–0.507.
- Neither curve rises significantly above 0.50, indicating the model barely outperforms random guessing.
Training vs. Validation Loss
- Both Train Loss and Val Loss begin ~0.695 and remain nearly flat (0.695–0.697).
- No pronounced downward trend—model is not learning strong patterns from the title text alone.
Over-/Under-Fitting
- The almost-identical train & val curves with very small gap show no overfitting, but also no real learning (under-fitting).
Key Insight
- Title text by itself carries very weak signal for predicting high vs. low citations. A simple LSTM on raw titles is insufficient.
- We likely need richer text inputs (e.g. abstracts), external features (venue impact factors, author reputation), or pre-trained language models (BERT, GPT) to capture more predictive semantics.
Step7 Deep Data Mining¶
7.1 Co-Authorship Network & Community Detection¶
# 1) Co-Authorship Network & Community Detection
import pandas as pd
import networkx as nx
import community as community_louvain # pip install python-louvain
import plotly.graph_objects as go
# Load rows that have an author string; split "A; B; C" into a clean list,
# trimming whitespace and dropping empty fragments.
df = pd.read_csv("Combined_ResearchOutputs_Final.csv")
df = df.dropna(subset=["Authors"])
df["author_list"] = df["Authors"].str.split(";").apply(
    lambda names: [name.strip() for name in names if name.strip()]
)
# Build a weighted co-authorship graph: one edge per author pair, with
# weight = number of papers they appear on together.
G = nx.Graph()
for authors in df["author_list"]:
    for i, first in enumerate(authors):
        for second in authors[i + 1:]:
            if G.has_edge(first, second):
                G[first][second]["weight"] += 1
            else:
                G.add_edge(first, second, weight=1)
# Community detection: Louvain partition, stored as a node attribute.
partition = community_louvain.best_partition(G, weight="weight")
nx.set_node_attributes(G, partition, "community")

# Prepare for Plotly: spring layout plus flattened edge/node coordinate lists.
pos = nx.spring_layout(G, seed=42, k=0.15)
edge_x, edge_y, edge_w = [], [], []
for u, v, attrs in G.edges(data=True):
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_x.extend((x0, x1, None))  # None breaks the polyline between edges
    edge_y.extend((y0, y1, None))
    edge_w.append(attrs["weight"])

node_x, node_y, node_color, node_size = [], [], [], []
for node, attrs in G.nodes(data=True):
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_color.append(attrs["community"])
    # Marker size scales with the author's degree (number of co-authors).
    node_size.append(5 + 2 * G.degree(node))
# Plot with Plotly: one trace for all edges, one for all nodes.
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines',
)
node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers',
    marker=dict(
        size=node_size,
        color=node_color,
        colorscale='Viridis',
        showscale=True,
        line_width=1,
    ),
    text=list(G.nodes()),
    hoverinfo='text',
)
fig = go.Figure()
fig.add_trace(edge_trace)
fig.add_trace(node_trace)
fig.update_layout(
    title="Co-Authorship Network with Louvain Communities",
    showlegend=False,
    xaxis=dict(showgrid=False, zeroline=False, visible=False),
    yaxis=dict(showgrid=False, zeroline=False, visible=False),
    template="plotly_white",
)
fig.show(renderer='vscode')